From c91b76a6686bf15d1d934dfed5958c2820d457fb Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 31 Aug 2023 13:20:17 +0300 Subject: [PATCH 001/237] Add globals required by intra avx2 code. Remove unnecessary stuff from intra avx2. --- src/strategies/avx2/intra-avx2.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 838bad91..b865efa3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -34,13 +34,19 @@ #if COMPILE_INTEL_AVX2 && defined X86_64 #include "uvg266.h" +#include "cu.h" +#include "tables.h" #if UVG_BIT_DEPTH == 8 #include #include +#include +#include -#include "strategyselector.h" -#include "strategies/missing-intel-intrinsics.h" +#include "intra-avx2.h" + + #include "strategyselector.h" + #include "strategies/missing-intel-intrinsics.h" /** * \brief Generate angular predictions. @@ -1066,8 +1072,10 @@ static void uvg_pdpc_planar_dc_avx2( } } -#endif //UVG_BIT_DEPTH == 8 -#endif //COMPILE_INTEL_AVX2 && defined X86_64 +#endif // UVG_BIT_DEPTH == 8 + +#endif // COMPILE_INTEL_AVX2 && defined X86_64 + int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) { From e888be80e2277faf2574d3ec80c56523919dd911 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 7 Sep 2023 16:08:26 +0300 Subject: [PATCH 002/237] Add intra avx2 planar placeholder functions. Implement 8xN planar prediction. Note: does not work with height < 4 yet. Initial plan is to produce the planar prediction as two halves. This is subject to change at at this point, it seems only planar functions for different widths are needed. --- src/strategies/avx2/intra-avx2.c | 138 ++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b865efa3..7484cb0c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -515,7 +515,7 @@ static void uvg_angular_pred_avx2( * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. 
*/ -static void uvg_intra_pred_planar_avx2( +static void uvg_intra_pred_planar_avx2_old( const cu_loc_t* const cu_loc, color_t color, const uint8_t *const ref_top, @@ -616,6 +616,142 @@ static void uvg_intra_pred_planar_avx2( } } + +typedef void (intra_planar_half_func)(const uvg_pixel* ref, const int line, const int shift, __m256i* dst); + +// w1 and w2 for planar horizontal do not exist, since intra prediction must be at least of width 4 +// Also worth noting is that minimum amount of samples must be 16, +// therefore the smallest possible predictions are 4x4, 8x2 and 16x1 +static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi16(ref[8 + 1]); + + __m256i v_ref_coeff = _mm256_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + + __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + + for (int i = 0, d = 0; i < line; i += 2, ++d) { + // Handle 2 lines at a time + __m128i v_ref0 = _mm_set1_epi16(ref[i + 1]); + __m128i v_ref1 = _mm_set1_epi16(ref[i + 2]); + + __m256i v_ref = _mm256_castsi128_si256(v_ref0); + v_ref = _mm256_inserti128_si256(v_ref, v_ref1, 1); + + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); + + v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); + dst[d] = _mm256_slli_epi16(v_tmp, shift); + } +} +static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} + +static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w2(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + + // Got eight 8-bit samples, or 64 bits of data. Duplicate to fill a whole 256-bit vector. 
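+  // Inferred layout note: after the 32-bit shuffle below, each 128-bit lane holds
+  // ref[1..4], ref[1..4], ref[5..8], ref[5..8]. Unpacking low/high against the
+  // replicated last reference gives (ref, last_ref) byte pairs, so the two maddubs
+  // per iteration yield four 8-wide rows of 16-bit sums, which the permutes put
+  // back into row order.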
+ const __m128i v_ref_raw = _mm_load_si128((const __m128i*)&ref[1]); + __m256i v_ref = _mm256_castsi128_si256(v_ref_raw); + v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); + v_ref = _mm256_shuffle_epi32(v_ref, _MM_SHUFFLE(1, 1, 0, 0)); + + // Handle 4 lines at a time, unless line == 2 + for (int y = 0, d = 0; y < line; y += 4, d += 2) { + const int a1 = line - 1 - (y + 0); + const int b1 = (y + 0) + 1; + const int a2 = line - 1 - (y + 1); + const int b2 = (y + 1) + 1; + const int a3 = line - 1 - (y + 2); + const int b3 = (y + 2) + 1; + const int a4 = line - 1 - (y + 3); + const int b4 = (y + 3) + 1; + __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, + a2, b2, a2, b2, a2, b2, a2, b2, + a3, b3, a3, b3, a3, b3, a3, b3, + a4, b4, a4, b4, a4, b4, a4, b4); // TODO: these could be loaded from a table + __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); + __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); + + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); + v_madd_lo = _mm256_slli_epi16(v_madd_lo, shift); + v_madd_hi = _mm256_slli_epi16(v_madd_hi, shift); + __m256i v_tmp0 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); + + dst[d + 0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + dst[d + 1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + //__m256i v_tmp = _mm256_maddubs_epi16 + +} +static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} + + +static intra_planar_half_func* planar_func_table[2][6] = { + { NULL, NULL, intra_pred_planar_hor_w4, intra_pred_planar_hor_w8, intra_pred_planar_hor_w16, intra_pred_planar_hor_w32,}, + {intra_pred_planar_ver_w1, intra_pred_planar_ver_w2, intra_pred_planar_ver_w4, intra_pred_planar_ver_w8, intra_pred_planar_ver_w16, intra_pred_planar_ver_w32,} +}; + + +void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, + color_t color, + const uint8_t* const ref_top, + const uint8_t* const ref_left, + uint8_t* const dst) +{ + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int samples = width * height; + const __m256i v_samples = _mm256_set1_epi16(samples); + + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + const int shift_r = log2_width + log2_height + 1; + + __m256i v_pred_hor[64]; + __m256i v_pred_ver[64]; + + intra_planar_half_func* planar_hor = planar_func_table[0][log2_width]; + intra_planar_half_func* planar_ver = planar_func_table[1][log2_width]; + + planar_hor(ref_left, height, log2_height, v_pred_hor); + planar_ver(ref_top, height, log2_width, v_pred_ver); + + // debug + int16_t* hor_res = (int16_t*)v_pred_hor; + int16_t* ver_res = (int16_t*)v_pred_ver; + + __m256i v_res[64]; + for (int i = 0, d = 0; i < samples; i += 16, ++d) { + v_res[d] = _mm256_add_epi16(v_pred_ver[d], v_pred_hor[d]); + v_res[d] = _mm256_add_epi16(v_res[d], v_samples); + v_res[d] = _mm256_srli_epi16(v_res[d], shift_r); + } + + // debug + int16_t* res = (int16_t*)v_res; + + /*if (samples == 16) { + + } + else { + for (int i = 0, s = 0; i < samples; i += 16, s += 2) { + _mm256_store_si256((__m256i*)dst[i], _mm256_packus_epi16(v_res[s + 0], v_res[s + 1])); + } + }*/ +} + + // Calculate the DC value for a 4x4 block. The algorithm uses slightly // different addends, multipliers etc for different pixels in the block, // but for a fixed-size implementation one vector wide, all the weights, From 4bbd5f4c9a0a8cfc97db734343e74eb5e45616f7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 7 Sep 2023 18:00:08 +0300 Subject: [PATCH 003/237] Implement 4xN planar prediction. --- src/strategies/avx2/intra-avx2.c | 54 +++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7484cb0c..5ae1440b 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -622,7 +622,26 @@ typedef void (intra_planar_half_func)(const uvg_pixel* ref, const int line, cons // w1 and w2 for planar horizontal do not exist, since intra prediction must be at least of width 4 // Also worth noting is that minimum amount of samples must be 16, // therefore the smallest possible predictions are 4x4, 8x2 and 16x1 -static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi16(ref[4 + 1]); + + __m256i v_ref_coeff = _mm256_setr_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4); + + __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + + for (int i = 0, d = 0; i < line; i += 4, ++d) { + // Handle 4 lines at a time + __m256i v_ref = _mm256_setr_epi16(ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 2], ref[i + 2], ref[i + 2], ref[i + 2], + ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 4], ref[i + 4], ref[i + 4], ref[i + 4]); + + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); + + v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); + dst[d] = _mm256_slli_epi16(v_tmp, shift); + } +} static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { const __m256i v_last_ref = _mm256_set1_epi16(ref[8 + 1]); @@ -651,7 +670,35 @@ static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, 
cons static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} static void intra_pred_planar_ver_w2(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} -static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + + // Got four 8-bit references, or 32 bits of data. Duplicate to fill a whole 256-bit vector. + const uint32_t* tmp = (const uint32_t*)&ref[1]; // Cast to 32 bit int to load 4 refs at the same time + const __m256i v_ref = _mm256_set1_epi32(*tmp); + + // Handle 4 lines at a time + for (int y = 0, d = 0; y < line; y += 4, ++d) { + const int a1 = line - 1 - (y + 0); + const int a2 = line - 1 - (y + 1); + const int a3 = line - 1 - (y + 2); + const int a4 = line - 1 - (y + 3); + const int b1 = (y + 0) + 1; + const int b2 = (y + 1) + 1; + const int b3 = (y + 2) + 1; + const int b4 = (y + 3) + 1; + + __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, + a2, b2, a2, b2, a2, b2, a2, b2, + a3, b3, a3, b3, a3, b3, a3, b3, + a4, b4, a4, b4, a4, b4, a4, b4); // TODO: these could be loaded from a table + __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); + + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); + dst[d] = _mm256_slli_epi16(v_madd_lo, shift); + } +} static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); @@ -689,9 +736,6 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const dst[d + 0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); dst[d + 1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); } - - //__m256i v_tmp = _mm256_maddubs_epi16 - } static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} From 4ae234ef247ff9e2d350ce3a3173d43420ed661a Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 7 Sep 2023 21:02:26 +0300 Subject: [PATCH 004/237] Implement 16xN planar prediction. 
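
For orientation, the plain-C planar interpolation that these per-width half functions decompose looks roughly as below. This is a VTM-style reference formulation written for this note (names and the reference-array layout are mine, not code taken from the series); the horizontal half supplies hor, the vertical half supplies ver.

    #include <stdint.h>

    // Reference planar prediction for an 8-bit w x h block (w, h powers of two).
    // top[0..w] is the above reference row, left[0..h] the left reference column.
    static void planar_ref(const uint8_t *top, const uint8_t *left,
                           int w, int h, int log2_w, int log2_h, uint8_t *dst)
    {
      const int top_right   = top[w];   // anchor of the horizontal interpolation
      const int bottom_left = left[h];  // anchor of the vertical interpolation
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          const int hor = (w - 1 - x) * left[y] + (x + 1) * top_right;
          const int ver = (h - 1 - y) * top[x]  + (y + 1) * bottom_left;
          dst[y * w + x] = (uint8_t)(((hor << log2_h) + (ver << log2_w) + w * h)
                                     >> (log2_w + log2_h + 1));
        }
      }
    }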
--- src/strategies/avx2/intra-avx2.c | 50 ++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5ae1440b..a71d9de3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -665,7 +665,24 @@ static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const dst[d] = _mm256_slli_epi16(v_tmp, shift); } } -static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi16(ref[16 + 1]); + + __m256i v_ref_coeff = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + + __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + + for (int i = 0, d = 0; i < line; ++i, ++d) { + __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); + + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); // TODO: the result is needed immediately after this. This leads to NOPs, consider doing multiple lines at a time + + v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); + dst[d] = _mm256_slli_epi16(v_tmp, shift); + } +} static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} @@ -737,7 +754,36 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const dst[d + 1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); } } -static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + + // Got 16 8-bit samples, or 128 bits of data. Duplicate to fill a whole 256-bit vector. 
+ const __m128i v_ref_raw = _mm_load_si128((const __m128i*) &ref[1]); + __m256i v_ref = _mm256_castsi128_si256(v_ref_raw); + v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); + + // Handle 2 lines at a time + for (int y = 0; y < line; y += 2) { + const int a1 = line - 1 - (y + 0); + const int b1 = (y + 0) + 1; + const int a2 = line - 1 - (y + 1); + const int b2 = (y + 1) + 1; + __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, + a1, b1, a1, b1, a1, b1, a1, b1, + a2, b2, a2, b2, a2, b2, a2, b2, + a2, b2, a2, b2, a2, b2, a2, b2); // TODO: these could be loaded from a table + __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); + __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); + + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); + v_madd_lo = _mm256_slli_epi16(v_madd_lo, shift); + v_madd_hi = _mm256_slli_epi16(v_madd_hi, shift); + dst[y + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); + dst[y + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); + } +} static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} From 0eb0f110c28d16e256d2077bbd0ddd8c49616ffe Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 8 Sep 2023 12:26:25 +0300 Subject: [PATCH 005/237] Add missing packus to the end of planar calculation. --- src/strategies/avx2/intra-avx2.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a71d9de3..3b9543ec 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -797,7 +797,7 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, color_t color, const uint8_t* const ref_top, const uint8_t* const ref_left, - uint8_t* const dst) + uint8_t* dst) { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -831,14 +831,20 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, // debug int16_t* res = (int16_t*)v_res; - /*if (samples == 16) { - + if (samples == 16) { + __m256i v_tmp = _mm256_packus_epi16(v_res[0], v_res[0]); + v_tmp = _mm256_permute4x64_epi64(v_tmp, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i v_tmp2 = _mm256_castsi256_si128(v_tmp); + _mm_store_si128((__m128i*)dst, v_tmp2); } else { - for (int i = 0, s = 0; i < samples; i += 16, s += 2) { - _mm256_store_si256((__m256i*)dst[i], _mm256_packus_epi16(v_res[s + 0], v_res[s + 1])); + for (int i = 0, s = 0; i < samples; i += 32, s += 2) { + __m256i v_tmp = _mm256_packus_epi16(v_res[s + 0], v_res[s + 1]); + v_tmp = _mm256_permute4x64_epi64(v_tmp, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)&dst[i], v_tmp); } - }*/ + } } From b02fb1b1afebf8ce6a04b5749926b7f922e6d6cc Mon Sep 17 00:00:00 2001 From: siivonek Date: Sun, 10 Sep 2023 19:46:42 +0300 Subject: [PATCH 006/237] Remove left shift from planar half functions. Implement the left shift with madd. Planar preds of width 4, 8 and 16 should work now without overflows. Add loop unroll macros to vertical half functions. Will be added to hor half functions later. 
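
In scalar terms, the madd-based combine amounts to the weighting below: the half results are interleaved as (hor, ver) 16-bit pairs and multiplied against packed (height, width) coefficients, so one madd produces the widened 32-bit sum and the separate per-half shifts become unnecessary. A sketch for a single pixel (helper name is mine):

    #include <stdint.h>

    // hor/ver are the unshifted 16-bit half results for one pixel.
    static inline uint8_t planar_combine(int16_t hor, int16_t ver,
                                         int width, int height,
                                         int log2_w, int log2_h)
    {
      // One lane of _mm256_madd_epi16: hor * height + ver * width, widened to 32 bits.
      const int32_t acc = (int32_t)hor * height + (int32_t)ver * width;
      // width * height == 1 << (log2_w + log2_h) is the rounding offset.
      return (uint8_t)((acc + width * height) >> (log2_w + log2_h + 1));
    }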
--- src/strategies/avx2/intra-avx2.c | 309 ++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 70 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3b9543ec..e935e8a2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -48,6 +48,77 @@ #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" +// Y coord tables +ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { + 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, // offset 0, line == 64 + 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, + 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, + 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, + 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, + 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, + 39, 25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, + 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, + 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, + 23, 41, 23, 41, 23, 41, 23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, + 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, + 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, + 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, + 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, + 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, + 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, // offset 16, line == 32 + 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, + 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, + 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, + 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, + 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 
14, 2, 14, 2, 14, 2, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, // offset 24, line == 16 + 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, + 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, // offset 28, line == 8 + 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, + 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, // offset 30, line == 4 + 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. line == 2 +}; + +ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2048] = { + 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, + 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, + 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, + 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, + 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, + 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, + 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, + 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, + 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, + 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, + 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, + 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, + 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, + 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, + 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, + 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, + 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, + 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, + 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, + 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 
23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, + 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, + 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, + 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, + 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, + 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, + 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, + 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, + 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, + 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, + 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, + 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, +}; + /** * \brief Generate angular predictions. * \param cu_loc CU locationand size data. @@ -633,13 +704,13 @@ static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const for (int i = 0, d = 0; i < line; i += 4, ++d) { // Handle 4 lines at a time + // TODO: setr is VERY SLOW, replace this __m256i v_ref = _mm256_setr_epi16(ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 2], ref[i + 2], ref[i + 2], ref[i + 2], ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 4], ref[i + 4], ref[i + 4], ref[i + 4]); __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); - v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); - dst[d] = _mm256_slli_epi16(v_tmp, shift); + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); } } static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) @@ -661,8 +732,7 @@ static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); - v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); - dst[d] = _mm256_slli_epi16(v_tmp, shift); + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); } } static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) @@ -679,8 +749,7 @@ static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, cons __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); // TODO: the result is needed immediately after this. This leads to NOPs, consider doing multiple lines at a time - v_tmp = _mm256_add_epi16(v_last_ref_mul, v_tmp); - dst[d] = _mm256_slli_epi16(v_tmp, shift); + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); } } static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} @@ -691,30 +760,53 @@ static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const { const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + // Overflow possible for this width if line > 32 + const bool overflow = line > 32; + // Got four 8-bit references, or 32 bits of data. 
Duplicate to fill a whole 256-bit vector. const uint32_t* tmp = (const uint32_t*)&ref[1]; // Cast to 32 bit int to load 4 refs at the same time const __m256i v_ref = _mm256_set1_epi32(*tmp); + const __m256i* v_ys = (const __m256i*)planar_avx2_ver_w4ys; + + // Table offset + int offset; + if (line == 64) { + offset = 0; + } + else if (line == 32) { + offset = 16; + } + else if (line == 16) { + offset = 24; + } + else if (line == 8) { + offset = 28; + } + else { // Do not care about lines < 4 since they are illegal + offset = 30; + } + // Handle 4 lines at a time - for (int y = 0, d = 0; y < line; y += 4, ++d) { - const int a1 = line - 1 - (y + 0); - const int a2 = line - 1 - (y + 1); - const int a3 = line - 1 - (y + 2); - const int a4 = line - 1 - (y + 3); - const int b1 = (y + 0) + 1; - const int b2 = (y + 1) + 1; - const int b3 = (y + 2) + 1; - const int b4 = (y + 3) + 1; - - __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, - a2, b2, a2, b2, a2, b2, a2, b2, - a3, b3, a3, b3, a3, b3, a3, b3, - a4, b4, a4, b4, a4, b4, a4, b4); // TODO: these could be loaded from a table - __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); - - __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); - dst[d] = _mm256_slli_epi16(v_madd_lo, shift); + #define UNROLL_LOOP(num) \ + for (int y = 0, s = offset, d = 0; y < (num); y += 4, ++s, ++d) { \ + __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); \ + dst[d] = _mm256_maddubs_epi16(v_lo, v_ys[s]); \ + } + + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { @@ -726,33 +818,57 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); v_ref = _mm256_shuffle_epi32(v_ref, _MM_SHUFFLE(1, 1, 0, 0)); - // Handle 4 lines at a time, unless line == 2 - for (int y = 0, d = 0; y < line; y += 4, d += 2) { - const int a1 = line - 1 - (y + 0); - const int b1 = (y + 0) + 1; - const int a2 = line - 1 - (y + 1); - const int b2 = (y + 1) + 1; - const int a3 = line - 1 - (y + 2); - const int b3 = (y + 2) + 1; - const int a4 = line - 1 - (y + 3); - const int b4 = (y + 3) + 1; - __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, - a2, b2, a2, b2, a2, b2, a2, b2, - a3, b3, a3, b3, a3, b3, a3, b3, - a4, b4, a4, b4, a4, b4, a4, b4); // TODO: these could be loaded from a table - __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); - __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); - - __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); - __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); - v_madd_lo = _mm256_slli_epi16(v_madd_lo, shift); - v_madd_hi = _mm256_slli_epi16(v_madd_hi, shift); - __m256i v_tmp0 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); - __m256i v_tmp1 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); - - dst[d + 0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); - dst[d + 1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + const __m256i* v_ys = (const __m256i*)planar_avx2_ver_w4ys; + + // Table offset + int offset; + if (line == 64) { + offset = 0; + } + else if (line == 32) { + offset = 16; + } + else if (line 
== 16) { + offset = 24; + } + else if (line == 8) { + offset = 28; + } + else if (line == 4) { + offset = 30; + } + else { // Do not care about line == 1 since it is illegal for this width + offset = 31; + } + + // Handle 4 lines at a time + #define UNROLL_LOOP(num) \ + for (int y = 0, s = offset, d = 0; y < (num); y += 4, ++s, d += 2) { \ + __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); \ + __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); \ + \ + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys[s]); \ + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys[s]); \ + __m256i v_tmp0 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); \ + __m256i v_tmp1 = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); \ + \ + dst[d + 0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); \ + dst[d + 1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); \ + } + + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { @@ -763,26 +879,55 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons __m256i v_ref = _mm256_castsi128_si256(v_ref_raw); v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); + const __m256i* v_ys = (const __m256i*)planar_avx2_ver_w8ys; + + // Table offset + int offset; + if (line == 64) { + offset = 0; + } + else if (line == 32) { + offset = 16; + } + else if (line == 16) { + offset = 24; + } + else if (line == 8) { + offset = 28; + } + else if (line == 4) { + offset = 30; + } + else { // Do not care about line == 1 since it is illegal for this width + offset = 31; + } + + // These stay constant through the loop + const __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); + const __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); + // Handle 2 lines at a time - for (int y = 0; y < line; y += 2) { - const int a1 = line - 1 - (y + 0); - const int b1 = (y + 0) + 1; - const int a2 = line - 1 - (y + 1); - const int b2 = (y + 1) + 1; - __m256i v_ys = _mm256_setr_epi8(a1, b1, a1, b1, a1, b1, a1, b1, - a1, b1, a1, b1, a1, b1, a1, b1, - a2, b2, a2, b2, a2, b2, a2, b2, - a2, b2, a2, b2, a2, b2, a2, b2); // TODO: these could be loaded from a table - __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); - __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); - - __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); - __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); - v_madd_lo = _mm256_slli_epi16(v_madd_lo, shift); - v_madd_hi = _mm256_slli_epi16(v_madd_hi, shift); - dst[y + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); - dst[y + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); + #define UNROLL_LOOP(num) \ + for (int y = 0, s = offset; y < (num); y += 2, ++s) { \ + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys[s]); \ + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys[s]); \ + dst[y + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); \ + dst[y + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); \ + } + + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: 
UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} @@ -802,7 +947,7 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int samples = width * height; - const __m256i v_samples = _mm256_set1_epi16(samples); + const __m256i v_samples = _mm256_set1_epi32(samples); const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; @@ -821,11 +966,35 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, int16_t* hor_res = (int16_t*)v_pred_hor; int16_t* ver_res = (int16_t*)v_pred_ver; + // Cast two 16-bit values to 32-bit and fill a 256-bit vector + int16_t tmp[2] = {height, width}; + int32_t* tmp2 = (int32_t*)tmp; + const __m256i v_madd_shift = _mm256_set1_epi32(*tmp2); + __m256i v_res[64]; - for (int i = 0, d = 0; i < samples; i += 16, ++d) { + // Old loop + /*for (int i = 0, d = 0; i < samples; i += 16, ++d) { v_res[d] = _mm256_add_epi16(v_pred_ver[d], v_pred_hor[d]); v_res[d] = _mm256_add_epi16(v_res[d], v_samples); v_res[d] = _mm256_srli_epi16(v_res[d], shift_r); + }*/ + + // New loop + for (int i = 0, d = 0; i < samples; i += 16, ++d) { + __m256i v_lo = _mm256_unpacklo_epi16(v_pred_hor[d], v_pred_ver[d]); + __m256i v_hi = _mm256_unpackhi_epi16(v_pred_hor[d], v_pred_ver[d]); + + // madd will extend the intermediate results to 32-bit to avoid overflows + __m256i v_madd_lo = _mm256_madd_epi16(v_lo, v_madd_shift); + __m256i v_madd_hi = _mm256_madd_epi16(v_hi, v_madd_shift); + + v_madd_lo = _mm256_add_epi32(v_madd_lo, v_samples); + v_madd_hi = _mm256_add_epi32(v_madd_hi, v_samples); + + v_madd_lo = _mm256_srli_epi32(v_madd_lo, shift_r); + v_madd_hi = _mm256_srli_epi32(v_madd_hi, shift_r); + + v_res[d] = _mm256_packs_epi32(v_madd_lo, v_madd_hi); } // debug From 8bbf01c37682ca01c380d1e82a2ce08f52e85829 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 12 Sep 2023 14:07:30 +0300 Subject: [PATCH 007/237] Change the right shift in pred_planar_avx2 to use a 128 bit register version of the right shift instrinsics, since when the integer version does not have a compile time constant the compiler is forced to generate the 128 bit register using version anyways, but also has to convert the integer to the 128 bit register, and the compiler does not optimize this properly and instead does the conversion on every call of the loop. 
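
Roughly, the fix means building the shift count once as an XMM value outside the loop and using the register form of the shift inside it; a standalone sketch of the pattern (helper name is mine, not from the diff):

    #include <immintrin.h>

    // Shift eight 32-bit lanes right by a count known only at run time. With a
    // variable count, the immediate form would make the compiler rebuild the
    // register operand on every iteration, so the count is hoisted here.
    static void srl32_rows(__m256i *rows, int n, int shift)
    {
      const __m128i v_count = _mm_cvtsi32_si128(shift);  // same effect as setzero + insert
      for (int i = 0; i < n; ++i) {
        rows[i] = _mm256_srl_epi32(rows[i], v_count);
      }
    }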
***THIS NEEDS TO BE DONE FOR ALL SHIFTS THAT DO NOT USE COMPILE TIME CONSTANT SHIFTS*** --- src/strategies/avx2/intra-avx2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e935e8a2..17f63414 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -980,6 +980,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, }*/ // New loop + __m128i shift_r_v = _mm_setzero_si128(); + shift_r_v = _mm_insert_epi32(shift_r_v, shift_r, 0); for (int i = 0, d = 0; i < samples; i += 16, ++d) { __m256i v_lo = _mm256_unpacklo_epi16(v_pred_hor[d], v_pred_ver[d]); __m256i v_hi = _mm256_unpackhi_epi16(v_pred_hor[d], v_pred_ver[d]); @@ -991,8 +993,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, v_madd_lo = _mm256_add_epi32(v_madd_lo, v_samples); v_madd_hi = _mm256_add_epi32(v_madd_hi, v_samples); - v_madd_lo = _mm256_srli_epi32(v_madd_lo, shift_r); - v_madd_hi = _mm256_srli_epi32(v_madd_hi, shift_r); + v_madd_lo = _mm256_srl_epi32(v_madd_lo, shift_r_v); + v_madd_hi = _mm256_srl_epi32(v_madd_hi, shift_r_v); v_res[d] = _mm256_packs_epi32(v_madd_lo, v_madd_hi); } From 4e4084434e5fe834fc625a06b77496050e53dfdb Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 12 Sep 2023 14:48:33 +0300 Subject: [PATCH 008/237] Remove setr from the loop in hor_w4 --- src/strategies/avx2/intra-avx2.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 17f63414..9c6f8cb8 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -701,12 +701,16 @@ static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4); __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + __m256i shuffle_mask = _mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1, 0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1); for (int i = 0, d = 0; i < line; i += 4, ++d) { // Handle 4 lines at a time - // TODO: setr is VERY SLOW, replace this - __m256i v_ref = _mm256_setr_epi16(ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 1], ref[i + 2], ref[i + 2], ref[i + 2], ref[i + 2], - ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 3], ref[i + 4], ref[i + 4], ref[i + 4], ref[i + 4]); + // | ref1 | ref2 | ref3 | ref4 | Don't care + __m128i v_ref_0 = _mm_loadu_si128((__m128i const*)& ref[i + 1]); + // | ref1 | 0 * 7 | ref2 | 0 * 7 | ref3 | 0 * 7 | ref4 | 0* 7 | + __m256i v_ref = _mm256_cvtepu8_epi64(v_ref_0); + // | ref1_l | ref1_h | ref1_l | ref1_h | ... + v_ref = _mm256_shuffle_epi8(v_ref, shuffle_mask); __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); From 2e468b7014f97d6283ab7d548d76a50abc8265ec Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 12 Sep 2023 14:49:50 +0300 Subject: [PATCH 009/237] Add full w8 y coordinate table. --- src/strategies/avx2/intra-avx2.c | 36 ++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9c6f8cb8..0dea819a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -84,8 +84,8 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. 
line == 2 }; -ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2048] = { - 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, +ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[4096] = { + 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, // offset 0, line == 64 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, @@ -117,6 +117,38 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2048] = { 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, + 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, // offset 32, line == 32 + 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, + 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, + 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, + 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, + 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, + 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, + 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, + 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, + 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, + 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, + 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, + 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, // offset 48, line == 16 + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, + 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, // offset 56, line == 8 + 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, + 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, + 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // offset 60, line == 4 + 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 62, line == 2 + 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 63, line == 1, this might not be needed, ever }; /** From c470e81ea2524aca7dfdc2a38033cfead1d7e617 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 12 Sep 2023 16:05:31 +0300 Subject: [PATCH 010/237] Add unroll macro for planar horizontal halfs. --- src/strategies/avx2/intra-avx2.c | 97 +++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0dea819a..1c940ecb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -735,19 +735,34 @@ static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); __m256i shuffle_mask = _mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1, 0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1); - for (int i = 0, d = 0; i < line; i += 4, ++d) { - // Handle 4 lines at a time - // | ref1 | ref2 | ref3 | ref4 | Don't care - __m128i v_ref_0 = _mm_loadu_si128((__m128i const*)& ref[i + 1]); - // | ref1 | 0 * 7 | ref2 | 0 * 7 | ref3 | 0 * 7 | ref4 | 0* 7 | - __m256i v_ref = _mm256_cvtepu8_epi64(v_ref_0); - // | ref1_l | ref1_h | ref1_l | ref1_h | ... - v_ref = _mm256_shuffle_epi8(v_ref, shuffle_mask); - - __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); - - dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); + // Handle 4 lines at a time + #define UNROLL_LOOP(num) \ + for (int i = 0, d = 0; i < (num); i += 4, ++d) { \ + /* | ref1 | ref2 | ref3 | ref4 | Don't care*/ \ + __m128i v_ref_0 = _mm_loadu_si128((__m128i const*)& ref[i + 1]); \ + /* | ref1 | 0 * 7 | ref2 | 0 * 7 | ref3 | 0 * 7 | ref4 | 0* 7 | */ \ + __m256i v_ref = _mm256_cvtepu8_epi64(v_ref_0); \ + /* | ref1_l | ref1_h | ref1_l | ref1_h | ... 
*/ \ + v_ref = _mm256_shuffle_epi8(v_ref, shuffle_mask); \ + \ + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); \ + \ + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); \ } + + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; + } + #undef UNROLL_LOOP } static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { @@ -758,18 +773,33 @@ static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); - for (int i = 0, d = 0; i < line; i += 2, ++d) { - // Handle 2 lines at a time - __m128i v_ref0 = _mm_set1_epi16(ref[i + 1]); - __m128i v_ref1 = _mm_set1_epi16(ref[i + 2]); - - __m256i v_ref = _mm256_castsi128_si256(v_ref0); - v_ref = _mm256_inserti128_si256(v_ref, v_ref1, 1); - - __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); + // Handle 2 lines at a time + #define UNROLL_LOOP(num) \ + for (int i = 0, d = 0; i < (num); i += 2, ++d) { \ + __m128i v_ref0 = _mm_set1_epi16(ref[i + 1]); \ + __m128i v_ref1 = _mm_set1_epi16(ref[i + 2]); \ + \ + __m256i v_ref = _mm256_castsi128_si256(v_ref0); \ + v_ref = _mm256_inserti128_si256(v_ref, v_ref1, 1); \ + \ + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); \ + \ + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); \ + } - dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { @@ -780,13 +810,26 @@ static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, cons __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); - for (int i = 0, d = 0; i < line; ++i, ++d) { - __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); - - __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); // TODO: the result is needed immediately after this. This leads to NOPs, consider doing multiple lines at a time + #define UNROLL_LOOP(num) \ + for (int i = 0, d = 0; i < (num); ++i, ++d) { \ + __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); \ + __m256i v_tmp = _mm256_mullo_epi16(v_ref, v_ref_coeff); \ + dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); \ + } - dst[d] = _mm256_add_epi16(v_last_ref_mul, v_tmp); + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} From d5cdce350b06a5e9ee151a744c3fa94fbc848b4f Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 12 Sep 2023 17:18:32 +0300 Subject: [PATCH 011/237] Implement 32 width planar hor and ver halves. 
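
All of the vertical halves, including the 32-wide one added here, rely on the same per-row weighting: output(x, y) = (h - 1 - y) * top[x] + (y + 1) * bottom_left, evaluated by maddubs on (sample, last-reference) byte pairs against (h - 1 - y, y + 1) coefficient pairs. A simplified 8-pixel, 128-bit sketch of that idea (hypothetical helper, not code from the patch):

    #include <immintrin.h>
    #include <stdint.h>

    // Weighted vertical interpolation for 8 pixels of row y in an h-row block.
    static inline __m128i planar_ver_8px(const uint8_t *top, uint8_t bottom_left,
                                         int h, int y)
    {
      const __m128i v_top   = _mm_loadl_epi64((const __m128i *)top);  // 8 samples
      const __m128i v_bl    = _mm_set1_epi8((char)bottom_left);
      const __m128i v_pairs = _mm_unpacklo_epi8(v_top, v_bl);         // (top, BL) pairs
      // Coefficient byte pair (h - 1 - y, y + 1); both fit a signed byte for h <= 64.
      const __m128i v_w     = _mm_set1_epi16((short)((h - 1 - y) | ((y + 1) << 8)));
      return _mm_maddubs_epi16(v_pairs, v_w);  // eight 16-bit weighted sums
    }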
--- src/strategies/avx2/intra-avx2.c | 45 ++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1c940ecb..49b72bc2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -831,7 +831,27 @@ static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi16(ref[32 + 1]); + + __m256i v_ref_coeff0 = _mm256_setr_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16); + __m256i v_ref_coeff1 = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + __m256i v_last_ref_coeff0 = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + __m256i v_last_ref_coeff1 = _mm256_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + __m256i v_last_ref_mul0 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff0); + __m256i v_last_ref_mul1 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff1); + + for (int i = 0, d = 0; i < line; ++i, d += 2) { + __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); + __m256i v_tmp0 = _mm256_mullo_epi16(v_ref, v_ref_coeff0); + __m256i v_tmp1 = _mm256_mullo_epi16(v_ref, v_ref_coeff1); + dst[d + 0] = _mm256_add_epi16(v_last_ref_mul0, v_tmp0); + dst[d + 1] = _mm256_add_epi16(v_last_ref_mul1, v_tmp1); + } +} static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} static void intra_pred_planar_ver_w2(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} @@ -1008,7 +1028,28 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} +static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + + // Got 32 8-bit samples, or 256 bits of data. Load into a single vector + const __m256i v_ref = _mm256_load_si256((const __m256i*) &ref[1]); + + // These stay constant through the loop + const __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); + const __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); + + for (int y = 0, a = line - 1, b = 1, d = 0; y < line; ++y, --a, ++b, d += 2) { + int8_t tmp[2] = {a, b}; + int16_t* tmp2 = (int16_t*)tmp; + const __m256i v_ys = _mm256_set1_epi16(*tmp2); + + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); + dst[d + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); + dst[d + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); + } +} static intra_planar_half_func* planar_func_table[2][6] = { From 84a28c8febde8fd34b06b2c0dcc2298920b1821e Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 12 Sep 2023 17:35:27 +0300 Subject: [PATCH 012/237] Fix memory overflows. Extended temp buffers to hold results for 64x64 predictions. 
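
A quick check of the sizing (my arithmetic, not from the commit): the largest prediction is 64x64, every intermediate is a 16-bit value, and one __m256i holds sixteen of them, so each temporary buffer needs 64 * 64 / 16 = 256 vectors.

    #include <immintrin.h>
    #include <stdint.h>

    // 64 * 64 samples, 2 bytes each, 32 bytes per __m256i.
    _Static_assert(64 * 64 * sizeof(int16_t) / sizeof(__m256i) == 256,
                   "64x64 16-bit intermediates fill 256 AVX2 vectors");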
--- src/strategies/avx2/intra-avx2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 49b72bc2..e3ddc22f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1062,7 +1062,7 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, color_t color, const uint8_t* const ref_top, const uint8_t* const ref_left, - uint8_t* dst) + uvg_pixel* dst) { const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; @@ -1073,8 +1073,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, const int log2_height = uvg_g_convert_to_log2[height]; const int shift_r = log2_width + log2_height + 1; - __m256i v_pred_hor[64]; - __m256i v_pred_ver[64]; + __m256i v_pred_hor[256]; + __m256i v_pred_ver[256]; intra_planar_half_func* planar_hor = planar_func_table[0][log2_width]; intra_planar_half_func* planar_ver = planar_func_table[1][log2_width]; @@ -1091,7 +1091,7 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, int32_t* tmp2 = (int32_t*)tmp; const __m256i v_madd_shift = _mm256_set1_epi32(*tmp2); - __m256i v_res[64]; + __m256i v_res[256]; // Old loop /*for (int i = 0, d = 0; i < samples; i += 16, ++d) { v_res[d] = _mm256_add_epi16(v_pred_ver[d], v_pred_hor[d]); From 4457c3f9b4a47476bb3a8774c765e773ac129ccf Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 13 Sep 2023 12:46:38 +0300 Subject: [PATCH 013/237] Add unroll macro for 32 width planar predictions. --- src/strategies/avx2/intra-avx2.c | 58 ++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e3ddc22f..89fe7064 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -844,13 +844,28 @@ static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, cons __m256i v_last_ref_mul0 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff0); __m256i v_last_ref_mul1 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff1); - for (int i = 0, d = 0; i < line; ++i, d += 2) { - __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); - __m256i v_tmp0 = _mm256_mullo_epi16(v_ref, v_ref_coeff0); - __m256i v_tmp1 = _mm256_mullo_epi16(v_ref, v_ref_coeff1); - dst[d + 0] = _mm256_add_epi16(v_last_ref_mul0, v_tmp0); - dst[d + 1] = _mm256_add_epi16(v_last_ref_mul1, v_tmp1); + #define UNROLL_LOOP(num) \ + for (int i = 0, d = 0; i < (num); ++i, d += 2) { \ + __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); \ + __m256i v_tmp0 = _mm256_mullo_epi16(v_ref, v_ref_coeff0); \ + __m256i v_tmp1 = _mm256_mullo_epi16(v_ref, v_ref_coeff1); \ + dst[d + 0] = _mm256_add_epi16(v_last_ref_mul0, v_tmp0); \ + dst[d + 1] = _mm256_add_epi16(v_last_ref_mul1, v_tmp1); \ + } + + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} @@ -1039,16 +1054,31 @@ static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, cons const __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); const __m256i v_hi = 
_mm256_unpackhi_epi8(v_ref, v_last_ref); - for (int y = 0, a = line - 1, b = 1, d = 0; y < line; ++y, --a, ++b, d += 2) { - int8_t tmp[2] = {a, b}; - int16_t* tmp2 = (int16_t*)tmp; - const __m256i v_ys = _mm256_set1_epi16(*tmp2); + #define UNROLL_LOOP(num) \ + for (int y = 0, a = (num) - 1, b = 1, d = 0; y < (num); ++y, --a, ++b, d += 2) { \ + int8_t tmp[2] = {a, b}; \ + int16_t* tmp2 = (int16_t*)tmp; \ + const __m256i v_ys = _mm256_set1_epi16(*tmp2); \ + \ + __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); \ + __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); \ + dst[d + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); \ + dst[d + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); \ + } - __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); - __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys); - dst[d + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); - dst[d + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); + switch (line) { + case 1: UNROLL_LOOP(1); break; + case 2: UNROLL_LOOP(2); break; + case 4: UNROLL_LOOP(4); break; + case 8: UNROLL_LOOP(8); break; + case 16: UNROLL_LOOP(16); break; + case 32: UNROLL_LOOP(32); break; + case 64: UNROLL_LOOP(64); break; + default: + assert(false && "Invalid dimension."); + break; } + #undef UNROLL_LOOP } From c69afa1f8e30e9857ae2a45800bfff2b21e97b21 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 13 Sep 2023 16:53:31 +0300 Subject: [PATCH 014/237] Fix planar hor width 16 to work with height 1 and 2. --- src/strategies/avx2/intra-avx2.c | 222 +++++++++++++++++++------------ 1 file changed, 138 insertions(+), 84 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 89fe7064..ce35777d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -84,7 +84,8 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. 
line == 2 }; -ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[4096] = { +// TODO: Reduce size back to 2048 if last line is not needed +ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, // offset 0, line == 64 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, @@ -147,8 +148,9 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[4096] = { 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // offset 60, line == 4 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 62, line == 2 - 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 63, line == 1, this might not be needed, ever + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // offset 62, line == 2 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever }; /** @@ -729,11 +731,11 @@ static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const { const __m256i v_last_ref = _mm256_set1_epi16(ref[4 + 1]); - __m256i v_ref_coeff = _mm256_setr_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); - __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4); + const __m256i v_ref_coeff = _mm256_setr_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4); - __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); - __m256i shuffle_mask = _mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1, 0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1); + const __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + const __m256i shuffle_mask = _mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1, 0, -1, 0, -1, 0, -1, 0, -1, 8, -1, 8, -1, 8, -1, 8, -1); // Handle 4 lines at a time #define UNROLL_LOOP(num) \ @@ -768,10 +770,10 @@ static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const { const __m256i v_last_ref = _mm256_set1_epi16(ref[8 + 1]); - __m256i v_ref_coeff = _mm256_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0); - __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + const __m256i v_ref_coeff = _mm256_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0); + const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + const __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); // Handle 2 lines at a time #define UNROLL_LOOP(num) \ @@ -805,10 +807,10 @@ static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int 
line, cons { const __m256i v_last_ref = _mm256_set1_epi16(ref[16 + 1]); - __m256i v_ref_coeff = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + const __m256i v_ref_coeff = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); + const __m256i v_last_ref_mul = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff); #define UNROLL_LOOP(num) \ for (int i = 0, d = 0; i < (num); ++i, ++d) { \ @@ -835,14 +837,14 @@ static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, cons { const __m256i v_last_ref = _mm256_set1_epi16(ref[32 + 1]); - __m256i v_ref_coeff0 = _mm256_setr_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16); - __m256i v_ref_coeff1 = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m256i v_ref_coeff0 = _mm256_setr_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16); + const __m256i v_ref_coeff1 = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - __m256i v_last_ref_coeff0 = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - __m256i v_last_ref_coeff1 = _mm256_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + const __m256i v_last_ref_coeff0 = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + const __m256i v_last_ref_coeff1 = _mm256_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); - __m256i v_last_ref_mul0 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff0); - __m256i v_last_ref_mul1 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff1); + const __m256i v_last_ref_mul0 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff0); + const __m256i v_last_ref_mul1 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff1); #define UNROLL_LOOP(num) \ for (int i = 0, d = 0; i < (num); ++i, d += 2) { \ @@ -867,9 +869,38 @@ static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } +static void intra_pred_planar_hor_w64(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi16(ref[64 + 1]); + + const __m256i v_ref_coeff0 = _mm256_setr_epi16(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48); + const __m256i v_ref_coeff1 = _mm256_setr_epi16(47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32); + const __m256i v_ref_coeff2 = _mm256_setr_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16); + const __m256i v_ref_coeff3 = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + const __m256i v_last_ref_coeff0 = _mm256_setr_epi16( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + const __m256i v_last_ref_coeff1 = _mm256_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + const __m256i v_last_ref_coeff2 = _mm256_setr_epi16(33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48); + const __m256i v_last_ref_coeff3 = _mm256_setr_epi16(49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m256i v_last_ref_mul0 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff0); + const __m256i v_last_ref_mul1 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff1); + const 
__m256i v_last_ref_mul2 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff2); + const __m256i v_last_ref_mul3 = _mm256_mullo_epi16(v_last_ref, v_last_ref_coeff3); + + for (int i = 0, d = 0; i < line; ++i, d += 4) { + __m256i v_ref = _mm256_set1_epi16(ref[i + 1]); + __m256i v_tmp0 = _mm256_mullo_epi16(v_ref, v_ref_coeff0); + __m256i v_tmp1 = _mm256_mullo_epi16(v_ref, v_ref_coeff1); + __m256i v_tmp2 = _mm256_mullo_epi16(v_ref, v_ref_coeff2); + __m256i v_tmp3 = _mm256_mullo_epi16(v_ref, v_ref_coeff3); + dst[d + 0] = _mm256_add_epi16(v_last_ref_mul0, v_tmp0); + dst[d + 1] = _mm256_add_epi16(v_last_ref_mul1, v_tmp1); + dst[d + 2] = _mm256_add_epi16(v_last_ref_mul2, v_tmp2); + dst[d + 3] = _mm256_add_epi16(v_last_ref_mul3, v_tmp3); + } +} -static void intra_pred_planar_ver_w1(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} -static void intra_pred_planar_ver_w2(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) {} static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); @@ -885,20 +916,15 @@ static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const // Table offset int offset; - if (line == 64) { - offset = 0; - } - else if (line == 32) { - offset = 16; - } - else if (line == 16) { - offset = 24; - } - else if (line == 8) { - offset = 28; - } - else { // Do not care about lines < 4 since they are illegal - offset = 30; + switch (line) { + case 64: offset = 0; break; + case 32: offset = 16; break; + case 16: offset = 24; break; + case 8: offset = 28; break; + case 4: offset = 30; break; + default: + assert(false && "Invalid height for width 4."); + break; } // Handle 4 lines at a time @@ -927,7 +953,7 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); // Got eight 8-bit samples, or 64 bits of data. Duplicate to fill a whole 256-bit vector. - const __m128i v_ref_raw = _mm_load_si128((const __m128i*)&ref[1]); + const __m128i v_ref_raw = _mm_loadu_si128((const __m128i*)&ref[1]); __m256i v_ref = _mm256_castsi128_si256(v_ref_raw); v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); v_ref = _mm256_shuffle_epi32(v_ref, _MM_SHUFFLE(1, 1, 0, 0)); @@ -936,23 +962,16 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const // Table offset int offset; - if (line == 64) { - offset = 0; - } - else if (line == 32) { - offset = 16; - } - else if (line == 16) { - offset = 24; - } - else if (line == 8) { - offset = 28; - } - else if (line == 4) { - offset = 30; - } - else { // Do not care about line == 1 since it is illegal for this width - offset = 31; + switch (line) { + case 64: offset = 0; break; + case 32: offset = 16; break; + case 16: offset = 24; break; + case 8: offset = 28; break; + case 4: offset = 30; break; + case 2: offset = 31; break; + default: + assert(false && "Invalid height for width 8."); + break; } // Handle 4 lines at a time @@ -989,7 +1008,7 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); // Got 16 8-bit samples, or 128 bits of data. Duplicate to fill a whole 256-bit vector. 
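// Note: &ref[1] starts one sample past the beginning of the reference array, so it is not guaranteed to be 16-byte aligned; the load below therefore has to be an unaligned one.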
- const __m128i v_ref_raw = _mm_load_si128((const __m128i*) &ref[1]); + const __m128i v_ref_raw = _mm_loadu_si128((const __m128i*) &ref[1]); __m256i v_ref = _mm256_castsi128_si256(v_ref_raw); v_ref = _mm256_inserti128_si256(v_ref, v_ref_raw, 1); @@ -997,41 +1016,46 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons // Table offset int offset; - if (line == 64) { - offset = 0; - } - else if (line == 32) { - offset = 16; - } - else if (line == 16) { - offset = 24; - } - else if (line == 8) { - offset = 28; - } - else if (line == 4) { - offset = 30; - } - else { // Do not care about line == 1 since it is illegal for this width - offset = 31; + switch (line) { + case 64: offset = 0; break; + case 32: offset = 32; break; + case 16: offset = 48; break; + case 8: offset = 56; break; + case 4: offset = 60; break; + case 2: offset = 62; break; + case 1: offset = 64; break; + default: + assert(false && "Invalid height for width 16."); + break; } + // Calculations for cases where line > 2 // These stay constant through the loop const __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); const __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); // Handle 2 lines at a time #define UNROLL_LOOP(num) \ - for (int y = 0, s = offset; y < (num); y += 2, ++s) { \ + for (int y = 0, s = offset; y < line; y += 2, ++s) { \ __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys[s]); \ __m256i v_madd_hi = _mm256_maddubs_epi16(v_hi, v_ys[s]); \ dst[y + 0] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x20); \ dst[y + 1] = _mm256_permute2x128_si256(v_madd_lo, v_madd_hi, 0x31); \ } + __m256i v_tmp; switch (line) { - case 1: UNROLL_LOOP(1); break; - case 2: UNROLL_LOOP(2); break; + case 1: + // Specialized calculation for line == 1 + v_tmp = _mm256_permute2x128_si256(v_lo, v_hi, 0x20); + dst[0] = _mm256_maddubs_epi16(v_tmp, v_ys[offset + 0]); + break; + case 2: + // Specialized calculation for line == 2 + v_tmp = _mm256_permute2x128_si256(v_lo, v_hi, 0x20); + dst[0] = _mm256_maddubs_epi16(v_tmp, v_ys[offset + 0]); + dst[1] = _mm256_maddubs_epi16(v_tmp, v_ys[offset + 1]); + break; case 4: UNROLL_LOOP(4); break; case 8: UNROLL_LOOP(8); break; case 16: UNROLL_LOOP(16); break; @@ -1041,23 +1065,23 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons assert(false && "Invalid dimension."); break; } - #undef UNROLL_LOOP +#undef UNROLL_LOOP } static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) { const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); // Got 32 8-bit samples, or 256 bits of data. 
Load into a single vector - const __m256i v_ref = _mm256_load_si256((const __m256i*) &ref[1]); + const __m256i v_ref = _mm256_loadu_si256((const __m256i*) &ref[1]); // These stay constant through the loop const __m256i v_lo = _mm256_unpacklo_epi8(v_ref, v_last_ref); const __m256i v_hi = _mm256_unpackhi_epi8(v_ref, v_last_ref); #define UNROLL_LOOP(num) \ - for (int y = 0, a = (num) - 1, b = 1, d = 0; y < (num); ++y, --a, ++b, d += 2) { \ - int8_t tmp[2] = {a, b}; \ - int16_t* tmp2 = (int16_t*)tmp; \ + for (uint8_t y = 0, a = (num) - 1, b = 1, d = 0; y < (num); ++y, --a, ++b, d += 2) { \ + uint8_t tmp[2] = {a, b}; \ + uint16_t* tmp2 = (uint16_t*)tmp; \ const __m256i v_ys = _mm256_set1_epi16(*tmp2); \ \ __m256i v_madd_lo = _mm256_maddubs_epi16(v_lo, v_ys); \ @@ -1080,11 +1104,41 @@ static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } +static void intra_pred_planar_ver_w64(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +{ + const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + + // Got 64 8-bit samples, or 512 bits of data. Load into two vectors + const __m256i v_ref0 = _mm256_loadu_si256((const __m256i*) &ref[1]); + const __m256i v_ref1 = _mm256_loadu_si256((const __m256i*) &ref[33]); + + // These stay constant through the loop + const __m256i v_lo0 = _mm256_unpacklo_epi8(v_ref0, v_last_ref); + const __m256i v_lo1 = _mm256_unpacklo_epi8(v_ref1, v_last_ref); + const __m256i v_hi0 = _mm256_unpackhi_epi8(v_ref0, v_last_ref); + const __m256i v_hi1 = _mm256_unpackhi_epi8(v_ref1, v_last_ref); + + for (uint8_t y = 0, a = line - 1, b = 1, d = 0; y < line; ++y, --a, ++b, d += 4) { + uint8_t tmp[2] = {a, b}; + uint16_t* tmp2 = (uint16_t*)tmp; + const __m256i v_ys = _mm256_set1_epi16(*tmp2); + + __m256i v_madd_lo0 = _mm256_maddubs_epi16(v_lo0, v_ys); + __m256i v_madd_lo1 = _mm256_maddubs_epi16(v_lo1, v_ys); + __m256i v_madd_hi0 = _mm256_maddubs_epi16(v_hi0, v_ys); + __m256i v_madd_hi1 = _mm256_maddubs_epi16(v_hi1, v_ys); + + dst[d + 0] = _mm256_permute2x128_si256(v_madd_lo0, v_madd_hi0, 0x20); + dst[d + 1] = _mm256_permute2x128_si256(v_madd_lo0, v_madd_hi0, 0x31); + dst[d + 2] = _mm256_permute2x128_si256(v_madd_lo1, v_madd_hi1, 0x20); + dst[d + 3] = _mm256_permute2x128_si256(v_madd_lo1, v_madd_hi1, 0x31); + } +} -static intra_planar_half_func* planar_func_table[2][6] = { - { NULL, NULL, intra_pred_planar_hor_w4, intra_pred_planar_hor_w8, intra_pred_planar_hor_w16, intra_pred_planar_hor_w32,}, - {intra_pred_planar_ver_w1, intra_pred_planar_ver_w2, intra_pred_planar_ver_w4, intra_pred_planar_ver_w8, intra_pred_planar_ver_w16, intra_pred_planar_ver_w32,} +static intra_planar_half_func* planar_func_table[2][7] = { + { NULL, NULL, intra_pred_planar_hor_w4, intra_pred_planar_hor_w8, intra_pred_planar_hor_w16, intra_pred_planar_hor_w32, intra_pred_planar_hor_w64}, + { NULL, NULL, intra_pred_planar_ver_w4, intra_pred_planar_ver_w8, intra_pred_planar_ver_w16, intra_pred_planar_ver_w32, intra_pred_planar_ver_w64} }; @@ -1094,8 +1148,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, const uint8_t* const ref_left, uvg_pixel* dst) { - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int16_t width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int16_t height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; const int samples = width * height; const __m256i v_samples = _mm256_set1_epi32(samples); From 182d977451acc862b52fd31195078917fd61e239 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Apr 2024 17:17:52 +0300 Subject: [PATCH 015/237] Fix planar. The planar mode only worked by accident since it was tested with the old version of speed bench, where both references, top and left, were the same. After changing how the random input values mode works (it now uses different random seed for both reference arrays), the error surfaced. --- src/strategies/avx2/intra-avx2.c | 46 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index ce35777d..a4ef4747 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -722,14 +722,14 @@ static void uvg_intra_pred_planar_avx2_old( } -typedef void (intra_planar_half_func)(const uvg_pixel* ref, const int line, const int shift, __m256i* dst); +typedef void (intra_planar_half_func)(const uvg_pixel* ref_main, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst); // w1 and w2 for planar horizontal do not exist, since intra prediction must be at least of width 4 // Also worth noting is that minimum amount of samples must be 16, // therefore the smallest possible predictions are 4x4, 8x2 and 16x1 -static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi16(ref[4 + 1]); + const __m256i v_last_ref = _mm256_set1_epi16(ref_side[4 + 1]); const __m256i v_ref_coeff = _mm256_setr_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4); @@ -766,9 +766,9 @@ static void intra_pred_planar_hor_w4(const uvg_pixel* ref, const int line, const } #undef UNROLL_LOOP } -static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi16(ref[8 + 1]); + const __m256i v_last_ref = _mm256_set1_epi16(ref_side[8 + 1]); const __m256i v_ref_coeff = _mm256_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0); const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); @@ -803,9 +803,9 @@ static void intra_pred_planar_hor_w8(const uvg_pixel* ref, const int line, const } #undef UNROLL_LOOP } -static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi16(ref[16 + 1]); + const __m256i v_last_ref = _mm256_set1_epi16(ref_side[16 + 1]); const __m256i v_ref_coeff = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const __m256i v_last_ref_coeff = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); @@ -833,9 +833,9 @@ static void intra_pred_planar_hor_w16(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void 
intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi16(ref[32 + 1]); + const __m256i v_last_ref = _mm256_set1_epi16(ref_side[32 + 1]); const __m256i v_ref_coeff0 = _mm256_setr_epi16(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16); const __m256i v_ref_coeff1 = _mm256_setr_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -869,9 +869,9 @@ static void intra_pred_planar_hor_w32(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void intra_pred_planar_hor_w64(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_hor_w64(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi16(ref[64 + 1]); + const __m256i v_last_ref = _mm256_set1_epi16(ref_side[64 + 1]); const __m256i v_ref_coeff0 = _mm256_setr_epi16(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48); const __m256i v_ref_coeff1 = _mm256_setr_epi16(47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32); @@ -901,9 +901,9 @@ static void intra_pred_planar_hor_w64(const uvg_pixel* ref, const int line, cons } } -static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + const __m256i v_last_ref = _mm256_set1_epi8(ref_side[line + 1]); // Overflow possible for this width if line > 32 const bool overflow = line > 32; @@ -948,9 +948,9 @@ static void intra_pred_planar_ver_w4(const uvg_pixel* ref, const int line, const } #undef UNROLL_LOOP } -static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + const __m256i v_last_ref = _mm256_set1_epi8(ref_side[line + 1]); // Got eight 8-bit samples, or 64 bits of data. Duplicate to fill a whole 256-bit vector. const __m128i v_ref_raw = _mm_loadu_si128((const __m128i*)&ref[1]); @@ -1003,9 +1003,9 @@ static void intra_pred_planar_ver_w8(const uvg_pixel* ref, const int line, const } #undef UNROLL_LOOP } -static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + const __m256i v_last_ref = _mm256_set1_epi8(ref_side[line + 1]); // Got 16 8-bit samples, or 128 bits of data. Duplicate to fill a whole 256-bit vector. 
const __m128i v_ref_raw = _mm_loadu_si128((const __m128i*) &ref[1]); @@ -1067,9 +1067,9 @@ static void intra_pred_planar_ver_w16(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + const __m256i v_last_ref = _mm256_set1_epi8(ref_side[line + 1]); // Got 32 8-bit samples, or 256 bits of data. Load into a single vector const __m256i v_ref = _mm256_loadu_si256((const __m256i*) &ref[1]); @@ -1104,9 +1104,9 @@ static void intra_pred_planar_ver_w32(const uvg_pixel* ref, const int line, cons } #undef UNROLL_LOOP } -static void intra_pred_planar_ver_w64(const uvg_pixel* ref, const int line, const int shift, __m256i* dst) +static void intra_pred_planar_ver_w64(const uvg_pixel* ref, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst) { - const __m256i v_last_ref = _mm256_set1_epi8(ref[line + 1]); + const __m256i v_last_ref = _mm256_set1_epi8(ref_side[line + 1]); // Got 64 8-bit samples, or 512 bits of data. Load into two vectors const __m256i v_ref0 = _mm256_loadu_si256((const __m256i*) &ref[1]); @@ -1163,8 +1163,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, intra_planar_half_func* planar_hor = planar_func_table[0][log2_width]; intra_planar_half_func* planar_ver = planar_func_table[1][log2_width]; - planar_hor(ref_left, height, log2_height, v_pred_hor); - planar_ver(ref_top, height, log2_width, v_pred_ver); + planar_hor(ref_left, ref_top, height, log2_height, v_pred_hor); + planar_ver(ref_top, ref_left, height, log2_width, v_pred_ver); // debug int16_t* hor_res = (int16_t*)v_pred_hor; From 33d7b8348175630afab4a4192e43b0b1601443b0 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 14 Sep 2023 16:36:14 +0300 Subject: [PATCH 016/237] Some code cleanup and placeholders. --- src/strategies/avx2/intra-avx2.c | 57 ++++++++++++-------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a4ef4747..9839c48f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -163,7 +163,7 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { * \param dst Buffer of size width*width. * \param multi_ref_idx Reference line index for use with MRL. */ -static void uvg_angular_pred_avx2( +static void uvg_angular_pred_avx2_old( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, @@ -255,9 +255,8 @@ static void uvg_angular_pred_avx2( { 0, 2, 63, -1 }, }; - // Temporary buffer for modes 11-25. - // It only needs to be big enough to hold indices from -width to width-1. - //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE:IDX] = { 0 }; + // Temporary buffer for modes 11-25. + // It only needs to be big enough to hold indices from -width to width-1. 
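// Rough size breakdown (presumably): 2 * 128 covers twice the maximum block dimension for the projected reference, + 3 leaves room for the 4-tap filter overread, and 33 * MAX_REF_LINE_IDX adds headroom for each extra MRL reference line.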
uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; @@ -521,38 +520,6 @@ static void uvg_angular_pred_avx2( *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); } } - - /* - if (pred_mode == 2 || pred_mode == 66) { - int wT = 16 >> MIN(31, ((y << 1) >> scale)); - for (int x = 0; x < width; x++) { - int wL = 16 >> MIN(31, ((x << 1) >> scale)); - if (wT + wL == 0) break; - int c = x + y + 1; - if (c >= 2 * width) { wL = 0; } - if (c >= 2 * width) { wT = 0; } - const uvg_pixel left = (wL != 0) ? ref_side[c] : 0; - const uvg_pixel top = (wT != 0) ? ref_main[c] : 0; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6); - } - } else if (sample_disp == 0 || sample_disp >= 12) { - int inv_angle_sum_0 = 2; - for (int x = 0; x < width; x++) { - inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)]; - int delta_pos_0 = inv_angle_sum_0 >> 2; - int delta_frac_0 = delta_pos_0 & 63; - int delta_int_0 = delta_pos_0 >> 6; - int delta_y = y + delta_int_0 + 1; - // TODO: convert to JVET_K0500_WAIP - if (delta_y > width + width - 1) break; - - int wL = 32 >> MIN(31, ((x << 1) >> scale)); - if (wL == 0) break; - const uvg_pixel *p = ref_side + delta_y - 1; - uvg_pixel left = p[delta_frac_0 >> 5]; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6); - } - }*/ } } else { @@ -612,6 +579,22 @@ static void uvg_angular_pred_avx2( } } + +static void uvg_angular_pred_avx2( + const cu_loc_t* const cu_loc, + const int_fast8_t intra_mode, + const int_fast8_t channel_type, + const uvg_pixel* const in_ref_above, + const uvg_pixel* const in_ref_left, + uvg_pixel* const dst, + const uint8_t multi_ref_idx, + const uint8_t isp_mode, + const int cu_dim) +{ + +} + + /** * \brief Generate planar prediction. * \param cu_loc CU location and size data. @@ -1588,6 +1571,8 @@ static void uvg_intra_pred_filtered_dc_avx2( * \param used_ref Pointer used reference pixel struct. * \param dst Buffer of size width*width. */ +// TODO: does not work with blocks with height 1 and 2 +// TODO: also has width someplaces where height should be static void uvg_pdpc_planar_dc_avx2( const int mode, const cu_loc_t* const cu_loc, From a29534800a82c51692e2fc69373f59530506abe3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 15 Sep 2023 14:08:16 +0300 Subject: [PATCH 017/237] Copy contents from old intra angular avx2 to new one. Change width to height in loops which need it. --- src/strategies/avx2/intra-avx2.c | 359 +++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9839c48f..dfa0fb4a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -590,8 +590,367 @@ static void uvg_angular_pred_avx2( const uint8_t multi_ref_idx, const uint8_t isp_mode, const int cu_dim) + { + // ISP_TODO: non-square block implementation, height is passed but not used + const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = channel_type == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // TODO: extend limits to include height 1 and 2, and dim 64 for both + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + assert(intra_mode >= 2 && intra_mode <= 66); + + uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; + uint8_t isp = isp_mode; + + __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + ); + + __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + ); + + __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + ); + + static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; + static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp + static const int32_t pre_scale[] = { 8, 7, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -2, -3 }; + + static const int16_t cubic_filter[32][4] = + { + { 0, 64, 0, 0 }, + { -1, 63, 2, 0 }, + { -2, 62, 4, 0 }, + { -2, 60, 7, -1 }, + { -2, 58, 10, -2 }, + { -3, 57, 12, -2 }, + { -4, 56, 14, -2 }, + { -4, 55, 15, -2 }, + { -4, 54, 16, -2 }, + { -5, 53, 18, -2 }, + { -6, 52, 20, -2 }, + { -6, 49, 24, -3 }, + { -6, 46, 28, -4 }, + { -5, 44, 29, -4 }, + { -4, 42, 30, -4 }, + { -4, 39, 33, -4 }, + { -4, 36, 36, -4 }, + { -4, 33, 39, -4 }, + { -4, 30, 42, -4 }, + { -4, 29, 44, -5 }, + { -4, 28, 46, -6 }, + { -3, 24, 49, -6 }, + { -2, 20, 52, -6 }, + { -2, 18, 53, -5 }, + { -2, 16, 54, -4 }, + { -2, 15, 55, -4 }, + { -2, 14, 56, -4 }, + { -2, 12, 57, -3 }, + { -2, 10, 58, -2 }, + { -1, 7, 60, -2 }, + { 0, 4, 62, -2 }, + { 0, 2, 63, -1 }, + }; + + // Temporary buffer for modes 11-25. + // It only needs to be big enough to hold indices from -width to width-1. + uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + + int32_t pred_mode = intra_mode; // ToDo: handle WAIP + + // Whether to swap references to always project on the left reference row. + const bool vertical_mode = intra_mode >= 34; + // Modes distance to horizontal or vertical mode. + const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18); + //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; + + // Sample displacement per column in fractions of 32. + const int_fast8_t sample_disp = (mode_disp < 0 ? 
-1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + + // TODO: replace latter width with height + int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); + + // Pointer for the reference we are interpolating from. + uvg_pixel* ref_main; + // Pointer for the other reference. + const uvg_pixel* ref_side; + + // Set ref_main and ref_side such that, when indexed with 0, they point to + // index 0 in block coordinates. + if (sample_disp < 0) { + memcpy(&temp_main[width], vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); + memcpy(&temp_side[width], vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); + + ref_main = temp_main + width; + ref_side = temp_side + width; + + for (int i = -width; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; + } + } + else { + memcpy(temp_main, vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width * 2 + multi_ref_index + 1)); + memcpy(temp_side, vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width * 2 + multi_ref_index + 1)); + + const int s = 0; + const int max_index = (multi_ref_index << s) + 2; + const int ref_length = width << 1; + const uvg_pixel val = temp_main[ref_length + multi_ref_index]; + memset(temp_main + ref_length + multi_ref_index, val, max_index + 1); + + ref_main = temp_main; + ref_side = temp_side; + } + + // compensate for line offset in reference line buffers + ref_main += multi_ref_index; + ref_side += multi_ref_index; + + static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; + int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; + int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); + + bool use_cubic = true; // Default to cubic filter + if (dist_from_vert_or_hor > filter_threshold) { + if ((abs(sample_disp) & 0x1F) != 0) + { + use_cubic = false; + } + } + // Cubic must be used if ref line != 0 or if isp mode != 0 + if (multi_ref_index || isp) { + use_cubic = true; + } + + if (sample_disp != 0) { + // The mode is not horizontal or vertical, we have to do interpolation. 
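+    // sample_disp is the per-row step along the main reference in 1/32-sample units; delta_pos accumulates the projected position, delta_int (>> 5) is the integer reference offset and delta_fract (& 31) the fractional phase that selects the interpolation filter.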
+ + int_fast32_t delta_pos = sample_disp * multi_ref_index; + int64_t delta_int[4] = { 0 }; + int16_t delta_fract[4] = { 0 }; + for (int_fast32_t y = 0; y + 3 < height; y += 4) { + + for (int yy = 0; yy < 4; ++yy) { + delta_pos += sample_disp; + delta_int[yy] = delta_pos >> 5; + delta_fract[yy] = delta_pos & (32 - 1); + } + + if ((abs(sample_disp) & 0x1F) != 0) { + + // Luma Channel + if (channel_type == 0) { + int16_t f[4][4] = { { 0 } }; + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[0]], 8); + memcpy(f[1], cubic_filter[delta_fract[1]], 8); + memcpy(f[2], cubic_filter[delta_fract[2]], 8); + memcpy(f[3], cubic_filter[delta_fract[3]], 8); + } + else { + for (int yy = 0; yy < 4; ++yy) { + const int16_t offset = (delta_fract[yy] >> 1); + f[yy][0] = 16 - offset; + f[yy][1] = 32 - offset; + f[yy][2] = 16 + offset; + f[yy][3] = offset; + } + } + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)ref_main; + __m256i vidx = _mm256_loadu_si256((__m256i*)delta_int); + __m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + } + } + else { + + // Do linear filtering + for (int yy = 0; yy < 4; ++yy) { + for (int_fast32_t x = 0; x < width; ++x) { + uvg_pixel ref1 = ref_main[x + delta_int[yy] + 1]; + uvg_pixel ref2 = ref_main[x + delta_int[yy] + 2]; + dst[(y + yy) * width + x] = ref1 + ((delta_fract[yy] * (ref2 - ref1) + 16) >> 5); + } + } + } + } + else { + // Just copy the integer samples + for (int yy = 0; yy < 4; ++yy) { + uvg_pixel* dst_row = dst + (y + yy) * width; + uvg_pixel* ref_row = ref_main + delta_int[yy] + 1; + for (int_fast32_t x = 0; x + 3 < width; x += 4) { + memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); + } + } + } + + + // PDPC + bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); + if (pred_mode > 1 && pred_mode < 67) { + if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. 
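+        // PDPC is specified only for reference line 0 and for prediction directions at or outside pure horizontal/vertical (non-negative displacement), hence it is turned off for MRL and for negative mode_disp.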
+ PDPC_filter = false; + } + else if (mode_disp > 0) { + PDPC_filter &= (scale >= 0); + } + } + if (PDPC_filter) { + + int16_t wL[4]; + int16_t left[4][4]; + + int limit = MIN(3 << scale, width); + + for (int x = 0; x < limit; x += 4) { + + for (int xx = 0; xx < 4; ++xx) { + int inv_angle_sum = 256 + (x + xx + 1) * modedisp2invsampledisp[abs(mode_disp)]; + wL[xx] = 32 >> (2 * (x + xx) >> scale); + + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1]; + } + } + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + uint64_t quad; + memcpy(&quad, wL, sizeof(quad)); + __m256i vwL = _mm256_set1_epi64x(quad); + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32)); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // Need to mask remainder samples on the last iteration when limit % 4 != 0 + int rem_bits = 8 * (limit - x); + __m128i ones = _mm_set1_epi32(0xFF); + __m128i vmask = _mm_slli_epi32(ones, rem_bits); + + // 0 selects filtered, 1 vdst (unchanged) + vdst = _mm_blendv_epi8(filtered, vdst, vmask); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vdst, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vdst, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vdst, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); + } + } + } + } + else { + // Mode is horizontal or vertical, just copy the pixels. + + // TODO: update outer loop to use height instead of width + for (int_fast32_t y = 0; y < height; ++y) { + for (int_fast32_t x = 0; x < width; ++x) { + dst[y * width + x] = ref_main[x + 1]; + } + if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { + int scale = (log2_width + log2_width - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + const uvg_pixel left = ref_side[1 + y]; + for (int i = 0; i < MIN(3 << scale, width); i++) { + const int wL = 32 >> (2 * i >> scale); + const uvg_pixel val = dst[y * width + i]; + dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + } + } + } + } + + // TODO: to get rid of this transpose, do a separate implementation for horizontal and vertical modes + // Flip the block if this is was a horizontal mode. 
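+  // For horizontal modes the two references were swapped at the start, so the block has been predicted in a transposed orientation and is flipped back into dst here.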
+ if (!vertical_mode) { + + const __m128i vtranspose_mask = _mm_setr_epi8( + 0, 4, 8, 12, + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15 + ); + + const __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + const __m128i vidx = _mm_slli_epi32(vseq, log2_width); + + // Transpose as 4x4 subblocks + for (int_fast32_t y = 0; y + 3 < width; y += 4) { + for (int_fast32_t x = y; x + 3 < width; x += 4) { + + __m128i vtemp4x4 = _mm_i32gather_epi32((const int32_t*)(dst + x * width + y), vidx, 1); + __m128i v4x4 = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); + vtemp4x4 = _mm_shuffle_epi8(vtemp4x4, vtranspose_mask); + v4x4 = _mm_shuffle_epi8(v4x4, vtranspose_mask); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vtemp4x4, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vtemp4x4, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vtemp4x4, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vtemp4x4, 3); + + *(uint32_t*)(dst + (x + 0) * width + y) = _mm_extract_epi32(v4x4, 0); + *(uint32_t*)(dst + (x + 1) * width + y) = _mm_extract_epi32(v4x4, 1); + *(uint32_t*)(dst + (x + 2) * width + y) = _mm_extract_epi32(v4x4, 2); + *(uint32_t*)(dst + (x + 3) * width + y) = _mm_extract_epi32(v4x4, 3); + } + } + } } From 01fb6a81c4aef2e4b3f34d3bbebb3baa2e6252dc Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 19 Sep 2023 13:33:19 +0300 Subject: [PATCH 018/237] Modify avx2 function to work with non-square blocks. Pure horizontal and vertical modes now work correctly for all block sizes. Other modes still fail. --- src/strategies/avx2/intra-avx2.c | 79 +++++++++++++++++++------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index dfa0fb4a..bdbafa06 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -43,6 +43,7 @@ #include #include +#include "global.h" #include "intra-avx2.h" #include "strategyselector.h" @@ -593,13 +594,12 @@ static void uvg_angular_pred_avx2( { // ISP_TODO: non-square block implementation, height is passed but not used - const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; - // TODO: extend limits to include height 1 and 2, and dim 64 for both - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 6) && (log2_height >= 0 && log2_height <= 6)); assert(intra_mode >= 2 && intra_mode <= 66); uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; @@ -697,6 +697,9 @@ static void uvg_angular_pred_avx2( // Pointer for the other reference. const uvg_pixel* ref_side; + const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1; + const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1; + // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { @@ -711,22 +714,32 @@ static void uvg_angular_pred_avx2( } } else { - memcpy(temp_main, vertical_mode ? 
in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width * 2 + multi_ref_index + 1)); - memcpy(temp_side, vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width * 2 + multi_ref_index + 1)); + memcpy(&temp_main[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_side[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - const int s = 0; - const int max_index = (multi_ref_index << s) + 2; - const int ref_length = width << 1; - const uvg_pixel val = temp_main[ref_length + multi_ref_index]; - memset(temp_main + ref_length + multi_ref_index, val, max_index + 1); + ref_main = vertical_mode ? temp_main : temp_side; + ref_side = vertical_mode ? temp_side : temp_main; - ref_main = temp_main; - ref_side = temp_side; + const int log2_ratio = log2_width - log2_height; + const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); + const int max_index = (multi_ref_index << s) + 2; + int ref_length; + if (isp_mode) { + ref_length = vertical_mode ? top_ref_length : left_ref_length; + } + else { + ref_length = vertical_mode ? width << 1 : height << 1; + } + const uvg_pixel val = ref_main[ref_length + multi_ref_index]; + for (int j = 1; j <= max_index; j++) { + ref_main[ref_length + multi_ref_index + j] = val; + } } // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; + if (!vertical_mode) { SWAP(width, height, int) } static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; @@ -836,10 +849,12 @@ static void uvg_angular_pred_avx2( // PDPC bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); if (pred_mode > 1 && pred_mode < 67) { - if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. + // Disable PDPC filter if both references are used or if MRL is used + if (mode_disp < 0 || multi_ref_index) { PDPC_filter = false; } else if (mode_disp > 0) { + // If scale is negative, PDPC filtering has no effect, therefore disable it. 
PDPC_filter &= (scale >= 0); } } @@ -903,8 +918,8 @@ static void uvg_angular_pred_avx2( for (int_fast32_t x = 0; x < width; ++x) { dst[y * width + x] = ref_main[x + 1]; } - if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_width - 2) >> 2; + if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { + int scale = (log2_width + log2_height - 2) >> 2; const uvg_pixel top_left = ref_main[0]; const uvg_pixel left = ref_side[1 + y]; for (int i = 0; i < MIN(3 << scale, width); i++) { @@ -930,24 +945,22 @@ static void uvg_angular_pred_avx2( const __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); const __m128i vidx = _mm_slli_epi32(vseq, log2_width); - // Transpose as 4x4 subblocks - for (int_fast32_t y = 0; y + 3 < width; y += 4) { - for (int_fast32_t x = y; x + 3 < width; x += 4) { - - __m128i vtemp4x4 = _mm_i32gather_epi32((const int32_t*)(dst + x * width + y), vidx, 1); - __m128i v4x4 = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - vtemp4x4 = _mm_shuffle_epi8(vtemp4x4, vtranspose_mask); - v4x4 = _mm_shuffle_epi8(v4x4, vtranspose_mask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vtemp4x4, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vtemp4x4, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vtemp4x4, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vtemp4x4, 3); + // Brute force transpose, works with all block sizes + uvg_pixel tmp[PRED_BUF_SIZE]; + memcpy(tmp, dst, (sizeof(uvg_pixel) * (width * height))); - *(uint32_t*)(dst + (x + 0) * width + y) = _mm_extract_epi32(v4x4, 0); - *(uint32_t*)(dst + (x + 1) * width + y) = _mm_extract_epi32(v4x4, 1); - *(uint32_t*)(dst + (x + 2) * width + y) = _mm_extract_epi32(v4x4, 2); - *(uint32_t*)(dst + (x + 3) * width + y) = _mm_extract_epi32(v4x4, 3); + if (width == height) { + for (int_fast32_t y = 0; y < height - 1; ++y) { + for (int_fast32_t x = y + 1; x < width; ++x) { + SWAP(dst[y * height + x], dst[x * width + y], uvg_pixel); + } + } + } + else { + for (int y = 0; y < width; ++y) { + for (int x = 0; x < height; ++x) { + dst[x + y * height] = tmp[y + x * width]; + } } } } From 580af764d8afd797645a41ce70014a6137ea94b8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 19 Sep 2023 16:07:45 +0300 Subject: [PATCH 019/237] Fix error in references. Some block sizes still fail. --- src/strategies/avx2/intra-avx2.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index bdbafa06..f91f029f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -703,14 +703,15 @@ static void uvg_angular_pred_avx2( // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { - memcpy(&temp_main[width], vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); - memcpy(&temp_side[width], vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); + memcpy(&temp_main[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_side[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); - ref_main = temp_main + width; - ref_side = temp_side + width; + ref_main = vertical_mode ? temp_main + height : temp_side + width; + ref_side = vertical_mode ? 
temp_side + width : temp_main + height; - for (int i = -width; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; + int size_side = vertical_mode ? height : width; + for (int i = -size_side; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } } else { From d5506f21531da438ee7243e7bd821f62da7ebcb6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 20 Sep 2023 13:00:42 +0300 Subject: [PATCH 020/237] Fix error in filter threshold calculation. Height was not considered. --- src/strategies/avx2/intra-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f91f029f..954c43d1 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -743,7 +743,7 @@ static void uvg_angular_pred_avx2( if (!vertical_mode) { SWAP(width, height, int) } static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); bool use_cubic = true; // Default to cubic filter From a9108dded74c01ab2c856f2481678f5d951b1d03 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 20 Sep 2023 15:01:01 +0300 Subject: [PATCH 021/237] Add if branch to handle 1 & 2 height horizontal mode predictions. Put the generic solution inside as a placeholder. --- src/strategies/avx2/intra-avx2.c | 311 +++++++++++++++++++------------ 1 file changed, 196 insertions(+), 115 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 954c43d1..485d34bd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -581,6 +581,61 @@ static void uvg_angular_pred_avx2_old( } +static void angular_pdpc_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int_fast8_t mode_disp, const int16_t inv_sample_disp) +{ + int16_t wL[4]; + int16_t left[4][4]; + + int limit = MIN(3 << scale, width); + const int log2_width = uvg_g_convert_to_log2[width]; + + + for (int_fast32_t y = 0; y + 3 < height; y += 4) { + for (int x = 0; x < limit; x += 4) { + + for (int xx = 0; xx < 4; ++xx) { + int inv_angle_sum = 256 + (x + xx + 1) * inv_sample_disp; + wL[xx] = 32 >> (2 * (x + xx) >> scale); + + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1]; + } + } + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + uint64_t quad; + memcpy(&quad, wL, sizeof(quad)); + __m256i vwL = _mm256_set1_epi64x(quad); + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32)); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // Need to mask remainder samples on the last iteration when limit % 4 != 0 + int rem_bits = 8 * (limit - x); + __m128i ones = 
_mm_set1_epi32(0xFF); + __m128i vmask = _mm_slli_epi32(ones, rem_bits); + + // 0 selects filtered, 1 vdst (unchanged) + vdst = _mm_blendv_epi8(filtered, vdst, vmask); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vdst, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vdst, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vdst, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); + } + } +} + static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, @@ -761,152 +816,159 @@ static void uvg_angular_pred_avx2( if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. + // These are for the height >= 4 version int_fast32_t delta_pos = sample_disp * multi_ref_index; int64_t delta_int[4] = { 0 }; int16_t delta_fract[4] = { 0 }; - for (int_fast32_t y = 0; y + 3 < height; y += 4) { - for (int yy = 0; yy < 4; ++yy) { - delta_pos += sample_disp; - delta_int[yy] = delta_pos >> 5; - delta_fract[yy] = delta_pos & (32 - 1); - } + // Special cases for height 1 & 2 + if (height < 4) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { - if ((abs(sample_disp) & 0x1F) != 0) { + int_fast32_t delta_int = delta_pos >> 5; + int_fast32_t delta_fract = delta_pos & (32 - 1); + const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; + int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; - // Luma Channel - if (channel_type == 0) { - int16_t f[4][4] = { { 0 } }; - - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[0]], 8); - memcpy(f[1], cubic_filter[delta_fract[1]], 8); - memcpy(f[2], cubic_filter[delta_fract[2]], 8); - memcpy(f[3], cubic_filter[delta_fract[3]], 8); - } - else { - for (int yy = 0; yy < 4; ++yy) { - const int16_t offset = (delta_fract[yy] >> 1); - f[yy][0] = 16 - offset; - f[yy][1] = 32 - offset; - f[yy][2] = 16 + offset; - f[yy][3] = offset; - } - } - - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)ref_main; - __m256i vidx = _mm256_loadu_si256((__m256i*)delta_int); - __m256i all_weights = _mm256_loadu_si256((__m256i*)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + if ((abs(sample_disp) & 0x1F) != 0) { - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + // Luma Channel + if (channel_type == 0) { + int32_t ref_main_index = delta_int; + uvg_pixel p[4]; - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + // Do 4-tap intra interpolation filtering + for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { + p[0] = ref_main[ref_main_index]; + p[1] = ref_main[ref_main_index + 1]; + p[2] = ref_main[ref_main_index + 2]; + p[3] = ref_main[ref_main_index + 3]; - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); + dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - 
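+            // Packing with unsigned saturation clamps the 16-bit sums to [0, 255], so this also performs the CLIP_TO_PIXEL step.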
__m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + } } - } - else { + else { - // Do linear filtering - for (int yy = 0; yy < 4; ++yy) { + // Do linear filtering for (int_fast32_t x = 0; x < width; ++x) { - uvg_pixel ref1 = ref_main[x + delta_int[yy] + 1]; - uvg_pixel ref2 = ref_main[x + delta_int[yy] + 2]; - dst[(y + yy) * width + x] = ref1 + ((delta_fract[yy] * (ref2 - ref1) + 16) >> 5); + uvg_pixel ref1 = ref_main[x + delta_int + 1]; + uvg_pixel ref2 = ref_main[x + delta_int + 2]; + dst[y * width + x] = ref1 + ((delta_fract * (ref2 - ref1) + 16) >> 5); } } } - } - else { - // Just copy the integer samples - for (int yy = 0; yy < 4; ++yy) { - uvg_pixel* dst_row = dst + (y + yy) * width; - uvg_pixel* ref_row = ref_main + delta_int[yy] + 1; - for (int_fast32_t x = 0; x + 3 < width; x += 4) { - memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); + else { + // Just copy the integer samples + for (int_fast32_t x = 0; x < width; x++) { + dst[y * width + x] = ref_main[x + delta_int + 1]; } } - } - // PDPC - bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); - if (pred_mode > 1 && pred_mode < 67) { - // Disable PDPC filter if both references are used or if MRL is used - if (mode_disp < 0 || multi_ref_index) { - PDPC_filter = false; + // PDPC + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0; + if (pred_mode > 1 && pred_mode < 67) { + if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. + PDPC_filter = false; + } + else if (mode_disp > 0) { + PDPC_filter &= (scale >= 0); + } } - else if (mode_disp > 0) { - // If scale is negative, PDPC filtering has no effect, therefore disable it. 
- PDPC_filter &= (scale >= 0); + if (PDPC_filter) { + int inv_angle_sum = 256; + for (int x = 0; x < MIN(3 << scale, width); x++) { + inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; + + int wL = 32 >> (2 * x >> scale); + const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; + dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6); + } } } - if (PDPC_filter) { + } + else { + for (int_fast32_t y = 0; y + 3 < height; y += 4) { - int16_t wL[4]; - int16_t left[4][4]; + for (int yy = 0; yy < 4; ++yy) { + delta_pos += sample_disp; + delta_int[yy] = delta_pos >> 5; + delta_fract[yy] = delta_pos & (32 - 1); + } - int limit = MIN(3 << scale, width); + if ((abs(sample_disp) & 0x1F) != 0) { - for (int x = 0; x < limit; x += 4) { + // Luma Channel + if (channel_type == 0) { + int16_t f[4][4] = { { 0 } }; - for (int xx = 0; xx < 4; ++xx) { - int inv_angle_sum = 256 + (x + xx + 1) * modedisp2invsampledisp[abs(mode_disp)]; - wL[xx] = 32 >> (2 * (x + xx) >> scale); + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[0]], 8); + memcpy(f[1], cubic_filter[delta_fract[1]], 8); + memcpy(f[2], cubic_filter[delta_fract[2]], 8); + memcpy(f[3], cubic_filter[delta_fract[3]], 8); + } + else { + for (int yy = 0; yy < 4; ++yy) { + const int16_t offset = (delta_fract[yy] >> 1); + f[yy][0] = 16 - offset; + f[yy][1] = 32 - offset; + f[yy][2] = 16 + offset; + f[yy][3] = offset; + } + } - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1]; + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)ref_main; + __m256i vidx = _mm256_loadu_si256((__m256i*)delta_int); + __m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); } } + else { - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - uint64_t quad; - memcpy(&quad, wL, sizeof(quad)); - __m256i vwL = _mm256_set1_epi64x(quad); - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vwL, accu); - accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32)); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - 
// Need to mask remainder samples on the last iteration when limit % 4 != 0 - int rem_bits = 8 * (limit - x); - __m128i ones = _mm_set1_epi32(0xFF); - __m128i vmask = _mm_slli_epi32(ones, rem_bits); - - // 0 selects filtered, 1 vdst (unchanged) - vdst = _mm_blendv_epi8(filtered, vdst, vmask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vdst, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vdst, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vdst, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); + // Do linear filtering + for (int yy = 0; yy < 4; ++yy) { + for (int_fast32_t x = 0; x < width; ++x) { + uvg_pixel ref1 = ref_main[x + delta_int[yy] + 1]; + uvg_pixel ref2 = ref_main[x + delta_int[yy] + 2]; + dst[(y + yy) * width + x] = ref1 + ((delta_fract[yy] * (ref2 - ref1) + 16) >> 5); + } + } + } + } + else { + // Just copy the integer samples + for (int yy = 0; yy < 4; ++yy) { + uvg_pixel* dst_row = dst + (y + yy) * width; + uvg_pixel* ref_row = ref_main + delta_int[yy] + 1; + for (int_fast32_t x = 0; x + 3 < width; x += 4) { + memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); + } + } } } } @@ -932,6 +994,25 @@ static void uvg_angular_pred_avx2( } } + // PDPC for non-horizontal and non-vertical modes + + if (!(pred_mode == 18 || pred_mode == 50)) { + bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); + if (pred_mode > 1 && pred_mode < 67) { + // Disable PDPC filter if both references are used or if MRL is used + if (mode_disp < 0 || multi_ref_index) { + PDPC_filter = false; + } + else if (mode_disp > 0) { + // If scale is negative, PDPC filtering has no effect, therefore disable it. + PDPC_filter &= (scale >= 0); + } + } + if (PDPC_filter) { + angular_pdpc_avx2(dst, ref_side, width, height, scale, mode_disp, modedisp2invsampledisp[abs(mode_disp)]); + } + } + // TODO: to get rid of this transpose, do a separate implementation for horizontal and vertical modes // Flip the block if this is was a horizontal mode. if (!vertical_mode) { From 57699bf695efefcba2a6859849ea8545ad7fea54 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 21 Sep 2023 15:26:16 +0300 Subject: [PATCH 022/237] Fix issue with calculating scale. Height was not taken into account. --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 485d34bd..f15cd834 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -744,8 +744,8 @@ static void uvg_angular_pred_avx2( // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - // TODO: replace latter width with height - int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); + const int side_size = vertical_mode ? log2_height : log2_width; + int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); // Pointer for the reference we are interpolating from. uvg_pixel* ref_main; From 1c29f1e2b6f0435ad0468a5128d856adf2837b80 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 21 Sep 2023 16:44:20 +0300 Subject: [PATCH 023/237] Fix avx2 height 1 & 2 for horizontal modes. 
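When a horizontal mode flips the block, an original height of 1 or 2 becomes a width of 1 or 2, which is too narrow for the 4-samples-per-row SIMD path, so those blocks are now filtered one sample at a time. For reference, the per-sample work reduces to the scalar sketch below; the standalone helper and its name are illustrative only (the patch inlines this directly, with the weights f[][] and the reference window picked by the surrounding loop, and it assumes the uvg_pixel and CLIP_TO_PIXEL definitions already used in this file):

    /* Illustrative helper: one output sample of the 4-tap intra interpolation.
     * 'ref' points at the first of four consecutive reference samples,
     * 'f' holds the cubic or linear weights chosen for this row's delta_fract. */
    static uvg_pixel angular_4tap_sample(const uvg_pixel *ref, const int16_t f[4])
    {
      int32_t sum = f[0] * ref[0] + f[1] * ref[1] + f[2] * ref[2] + f[3] * ref[3];
      return CLIP_TO_PIXEL((sum + 32) >> 6); /* +32 rounds before the 6-bit shift */
    }
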
--- src/strategies/avx2/intra-avx2.c | 59 +++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f15cd834..10c1e8dd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -926,26 +926,45 @@ static void uvg_angular_pred_avx2( __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { - - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); - - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + // Special case. Original height was < 4 and dimensions have been swapped -> current width < 4 (horizontal mode). + if (width < 4) { + for (int_fast32_t x = 0; x < width; ++x) { + uvg_pixel ref[4][4]; + for (int r = 0; r < 4; ++r) { + ref[0][r] = ref_main[(delta_int[0] + x) + r]; + ref[1][r] = ref_main[(delta_int[1] + x) + r]; + ref[2][r] = ref_main[(delta_int[2] + x) + r]; + ref[3][r] = ref_main[(delta_int[3] + x) + r]; + } + + dst[(y + 0) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0][0] * ref[0][0]) + (int32_t)(f[0][1] * ref[0][1]) + (int32_t)(f[0][2] * ref[0][2]) + (int32_t)(f[0][3] * ref[0][3]) + 32) >> 6);; + dst[(y + 1) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[1][0] * ref[1][0]) + (int32_t)(f[1][1] * ref[1][1]) + (int32_t)(f[1][2] * ref[1][2]) + (int32_t)(f[1][3] * ref[1][3]) + 32) >> 6);; + dst[(y + 2) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[2][0] * ref[2][0]) + (int32_t)(f[2][1] * ref[2][1]) + (int32_t)(f[2][2] * ref[2][2]) + (int32_t)(f[2][3] * ref[2][3]) + 32) >> 6);; + dst[(y + 3) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[3][0] * ref[3][0]) + (int32_t)(f[3][1] * ref[3][1]) + (int32_t)(f[3][2] * ref[3][2]) + (int32_t)(f[3][3] * ref[3][3]) + 32) >> 6);; + } + } + else { + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = 
_mm_extract_epi32(filtered, 3); + } } } else { From 7b09ce01d2e05b44cf2f1e8a82e07d1248f64ab6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 25 Oct 2023 15:49:52 +0300 Subject: [PATCH 024/237] Fix error in intra mode 2. --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 10c1e8dd..dcdf6276 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -181,7 +181,7 @@ static void uvg_angular_pred_avx2_old( const int log2_width = uvg_g_convert_to_log2[width]; const int log2_height = uvg_g_convert_to_log2[height]; - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); + assert((log2_width >= 2 && log2_width <= 6) && (log2_height <= 6)); assert(intra_mode >= 2 && intra_mode <= 66); // TODO: implement handling of MRL @@ -984,7 +984,7 @@ static void uvg_angular_pred_avx2( for (int yy = 0; yy < 4; ++yy) { uvg_pixel* dst_row = dst + (y + yy) * width; uvg_pixel* ref_row = ref_main + delta_int[yy] + 1; - for (int_fast32_t x = 0; x + 3 < width; x += 4) { + for (int_fast32_t x = 0; x < width; x += 4) { memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); } } From 3b5bdc59d432d1a83a1ea873230f3ed4a7ddd502 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 2 Oct 2023 16:59:22 +0300 Subject: [PATCH 025/237] Reworking intra avx2. Implement width 4 vertical. --- src/strategies/avx2/intra-avx2.c | 466 +++++++++++++------------------ 1 file changed, 198 insertions(+), 268 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index dcdf6276..a4acfee5 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -581,6 +581,164 @@ static void uvg_angular_pred_avx2_old( } +// TODO: vectorize +static void angular_pred_avx2_delta_tables(int64_t* delta_int, int32_t* delta_fract, const int line, const int mrl, const int sample_disp) +{ + for (int i = 0, delta_pos = sample_disp * (1 + mrl); i < line; ++i, delta_pos += sample_disp) { + delta_int[i] = delta_pos >> 5; + delta_fract[i] = delta_pos & (32 - 1); + } +} + + +static const int16_t cubic_filter[32][4] = +{ + { 0, 64, 0, 0 }, + { -1, 63, 2, 0 }, + { -2, 62, 4, 0 }, + { -2, 60, 7, -1 }, + { -2, 58, 10, -2 }, + { -3, 57, 12, -2 }, + { -4, 56, 14, -2 }, + { -4, 55, 15, -2 }, + { -4, 54, 16, -2 }, + { -5, 53, 18, -2 }, + { -6, 52, 20, -2 }, + { -6, 49, 24, -3 }, + { -6, 46, 28, -4 }, + { -5, 44, 29, -4 }, + { -4, 42, 30, -4 }, + { -4, 39, 33, -4 }, + { -4, 36, 36, -4 }, + { -4, 33, 39, -4 }, + { -4, 30, 42, -4 }, + { -4, 29, 44, -5 }, + { -4, 28, 46, -6 }, + { -3, 24, 49, -6 }, + { -2, 20, 52, -6 }, + { -2, 18, 53, -5 }, + { -2, 16, 54, -4 }, + { -2, 15, 55, -4 }, + { -2, 14, 56, -4 }, + { -2, 12, 57, -3 }, + { -2, 10, 58, -2 }, + { -1, 7, 60, -2 }, + { 0, 4, 62, -2 }, + { 0, 2, 63, -1 }, +}; + + +static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int64_t* delta_int, const int32_t* delta_fract, const int height, const int use_cubic) +{ + const int width = 4; + + const __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + const __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x02, 0x03, 0x03, 0x04, 
0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + ); + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + ); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + ); + + int16_t f[4][4] = { { 0 } }; + + // For a 4 width block, height must be at least 4. Handle 4 lines at once + for (int y = 0; y < height; y += 4) + { + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[y + 0]], 8); + memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); + memcpy(f[2], cubic_filter[delta_fract[y + 2]], 8); + memcpy(f[3], cubic_filter[delta_fract[y + 3]], 8); + } + else { + for (int yy = 0; yy < 4; ++yy) { + const int16_t offset = (delta_fract[y + yy] >> 1); + f[yy][0] = 16 - offset; + f[yy][1] = 32 - offset; + f[yy][2] = 16 + offset; + f[yy][3] = offset; + } + } + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)ref_main; + __m256i vidx = _mm256_loadu_si256((__m256i*)&delta_int[y]); + __m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + } + } +} + + +// TODO: vectorize +static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int64_t* delta_int, const int32_t* delta_fract) +{ + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + uvg_pixel ref1 = ref[x + delta_int[y] + 1]; + uvg_pixel ref2 = ref[x + delta_int[y] + 2]; + dst[y * width + x] = ref1 + ((delta_fract[y] * (ref2 - ref1) + 16) >> 5); + } + } +} + + +// TODO: vectorize +static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int64_t* delta_int) +{ + for (int y = 0; y < height; ++y) { + uvg_pixel* dst_row = dst + y * width; + uvg_pixel* ref_row = ref + delta_int[y] + 1; + for (int_fast32_t x = 0; x + 3 < width; x += 4) { + memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); + } + } +} + + static void angular_pdpc_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int_fast8_t mode_disp, const int16_t inv_sample_disp) { int16_t wL[4]; @@ -660,74 +818,10 @@ static 
void uvg_angular_pred_avx2( uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; uint8_t isp = isp_mode; - __m256i p_shuf_01 = _mm256_setr_epi8( - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c - ); - - __m256i p_shuf_23 = _mm256_setr_epi8( - 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, - 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e - ); - - __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a - ); - - __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e - ); - static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp static const int32_t pre_scale[] = { 8, 7, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -2, -3 }; - static const int16_t cubic_filter[32][4] = - { - { 0, 64, 0, 0 }, - { -1, 63, 2, 0 }, - { -2, 62, 4, 0 }, - { -2, 60, 7, -1 }, - { -2, 58, 10, -2 }, - { -3, 57, 12, -2 }, - { -4, 56, 14, -2 }, - { -4, 55, 15, -2 }, - { -4, 54, 16, -2 }, - { -5, 53, 18, -2 }, - { -6, 52, 20, -2 }, - { -6, 49, 24, -3 }, - { -6, 46, 28, -4 }, - { -5, 44, 29, -4 }, - { -4, 42, 30, -4 }, - { -4, 39, 33, -4 }, - { -4, 36, 36, -4 }, - { -4, 33, 39, -4 }, - { -4, 30, 42, -4 }, - { -4, 29, 44, -5 }, - { -4, 28, 46, -6 }, - { -3, 24, 49, -6 }, - { -2, 20, 52, -6 }, - { -2, 18, 53, -5 }, - { -2, 16, 54, -4 }, - { -2, 15, 55, -4 }, - { -2, 14, 56, -4 }, - { -2, 12, 57, -3 }, - { -2, 10, 58, -2 }, - { -1, 7, 60, -2 }, - { 0, 4, 62, -2 }, - { 0, 2, 63, -1 }, - }; - // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; @@ -795,7 +889,7 @@ static void uvg_angular_pred_avx2( // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; - if (!vertical_mode) { SWAP(width, height, int) } + //if (!vertical_mode) { SWAP(width, height, int) } static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; @@ -813,183 +907,54 @@ static void uvg_angular_pred_avx2( use_cubic = true; } - if (sample_disp != 0) { - // The mode is not horizontal or vertical, we have to do interpolation. 
- - // These are for the height >= 4 version - int_fast32_t delta_pos = sample_disp * multi_ref_index; - int64_t delta_int[4] = { 0 }; - int16_t delta_fract[4] = { 0 }; - - // Special cases for height 1 & 2 - if (height < 4) { - for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { - - int_fast32_t delta_int = delta_pos >> 5; - int_fast32_t delta_fract = delta_pos & (32 - 1); - const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; - int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; - - if ((abs(sample_disp) & 0x1F) != 0) { - - // Luma Channel - if (channel_type == 0) { - int32_t ref_main_index = delta_int; - uvg_pixel p[4]; - - // Do 4-tap intra interpolation filtering - for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { - p[0] = ref_main[ref_main_index]; - p[1] = ref_main[ref_main_index + 1]; - p[2] = ref_main[ref_main_index + 2]; - p[3] = ref_main[ref_main_index + 3]; - dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); - } - } - else { - - // Do linear filtering - for (int_fast32_t x = 0; x < width; ++x) { - uvg_pixel ref1 = ref_main[x + delta_int + 1]; - uvg_pixel ref2 = ref_main[x + delta_int + 2]; - dst[y * width + x] = ref1 + ((delta_fract * (ref2 - ref1) + 16) >> 5); - } + if (sample_disp != 0) { + // The mode is not horizontal or vertical, we have to do interpolation. + int64_t delta_int[MAX_PRED_WIDTH]; + int32_t delta_fract[MAX_PRED_WIDTH]; + + // TODO: for horizontal modes, these should be constructed using width instead of height + angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); + + // Check if the angle is fractional. If yes, interpolation is needed + if ((abs(sample_disp) & 0x1F) != 0) { + + // Luma Channel + if (channel_type == 0) { + if (vertical_mode) { + switch (width) { + case 4: angular_pred_avx2_w4_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 8: break; + case 16: break; + case 32: break; + case 64: break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; } } else { - // Just copy the integer samples - for (int_fast32_t x = 0; x < width; x++) { - dst[y * width + x] = ref_main[x + delta_int + 1]; - } - } - - - // PDPC - bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0; - if (pred_mode > 1 && pred_mode < 67) { - if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. 
- PDPC_filter = false; - } - else if (mode_disp > 0) { - PDPC_filter &= (scale >= 0); - } - } - if (PDPC_filter) { - int inv_angle_sum = 256; - for (int x = 0; x < MIN(3 << scale, width); x++) { - inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; - - int wL = 32 >> (2 * x >> scale); - const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; - dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6); + switch (width) { + case 4: break; + case 8: break; + case 16: break; + case 32: break; + case 64: break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; } } } + else { + // Do linear filtering for chroma channels + angular_pred_avx2_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); + } } else { - for (int_fast32_t y = 0; y + 3 < height; y += 4) { - - for (int yy = 0; yy < 4; ++yy) { - delta_pos += sample_disp; - delta_int[yy] = delta_pos >> 5; - delta_fract[yy] = delta_pos & (32 - 1); - } - - if ((abs(sample_disp) & 0x1F) != 0) { - - // Luma Channel - if (channel_type == 0) { - int16_t f[4][4] = { { 0 } }; - - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[0]], 8); - memcpy(f[1], cubic_filter[delta_fract[1]], 8); - memcpy(f[2], cubic_filter[delta_fract[2]], 8); - memcpy(f[3], cubic_filter[delta_fract[3]], 8); - } - else { - for (int yy = 0; yy < 4; ++yy) { - const int16_t offset = (delta_fract[yy] >> 1); - f[yy][0] = 16 - offset; - f[yy][1] = 32 - offset; - f[yy][2] = 16 + offset; - f[yy][3] = offset; - } - } - - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)ref_main; - __m256i vidx = _mm256_loadu_si256((__m256i*)delta_int); - __m256i all_weights = _mm256_loadu_si256((__m256i*)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - - // Special case. Original height was < 4 and dimensions have been swapped -> current width < 4 (horizontal mode). 
- if (width < 4) { - for (int_fast32_t x = 0; x < width; ++x) { - uvg_pixel ref[4][4]; - for (int r = 0; r < 4; ++r) { - ref[0][r] = ref_main[(delta_int[0] + x) + r]; - ref[1][r] = ref_main[(delta_int[1] + x) + r]; - ref[2][r] = ref_main[(delta_int[2] + x) + r]; - ref[3][r] = ref_main[(delta_int[3] + x) + r]; - } - - dst[(y + 0) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0][0] * ref[0][0]) + (int32_t)(f[0][1] * ref[0][1]) + (int32_t)(f[0][2] * ref[0][2]) + (int32_t)(f[0][3] * ref[0][3]) + 32) >> 6);; - dst[(y + 1) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[1][0] * ref[1][0]) + (int32_t)(f[1][1] * ref[1][1]) + (int32_t)(f[1][2] * ref[1][2]) + (int32_t)(f[1][3] * ref[1][3]) + 32) >> 6);; - dst[(y + 2) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[2][0] * ref[2][0]) + (int32_t)(f[2][1] * ref[2][1]) + (int32_t)(f[2][2] * ref[2][2]) + (int32_t)(f[2][3] * ref[2][3]) + 32) >> 6);; - dst[(y + 3) * width + x] = CLIP_TO_PIXEL(((int32_t)(f[3][0] * ref[3][0]) + (int32_t)(f[3][1] * ref[3][1]) + (int32_t)(f[3][2] * ref[3][2]) + (int32_t)(f[3][3] * ref[3][3]) + 32) >> 6);; - } - } - else { - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { - - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); - - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); - } - } - } - else { - - // Do linear filtering - for (int yy = 0; yy < 4; ++yy) { - for (int_fast32_t x = 0; x < width; ++x) { - uvg_pixel ref1 = ref_main[x + delta_int[yy] + 1]; - uvg_pixel ref2 = ref_main[x + delta_int[yy] + 2]; - dst[(y + yy) * width + x] = ref1 + ((delta_fract[yy] * (ref2 - ref1) + 16) >> 5); - } - } - } - } - else { - // Just copy the integer samples - for (int yy = 0; yy < 4; ++yy) { - uvg_pixel* dst_row = dst + (y + yy) * width; - uvg_pixel* ref_row = ref_main + delta_int[yy] + 1; - for (int_fast32_t x = 0; x < width; x += 4) { - memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); - } - } - } - } + // No interpolation or filtering needed, just copy the integer samples + angular_pred_avx2_non_fractional_angle_pxl_copy(dst, ref_main, width, height, delta_int); } } else { @@ -1014,9 +979,8 @@ static void uvg_angular_pred_avx2( } // PDPC for non-horizontal and non-vertical modes - if (!(pred_mode == 18 || pred_mode == 50)) { - bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); if (pred_mode > 1 && pred_mode < 67) { // Disable PDPC filter if both references are used or if MRL is used if (mode_disp < 0 || multi_ref_index) { @@ -1031,40 +995,6 @@ static void uvg_angular_pred_avx2( angular_pdpc_avx2(dst, ref_side, width, height, scale, mode_disp, modedisp2invsampledisp[abs(mode_disp)]); } } - - // TODO: to get rid of this transpose, do a separate implementation for 
horizontal and vertical modes - // Flip the block if this is was a horizontal mode. - if (!vertical_mode) { - - const __m128i vtranspose_mask = _mm_setr_epi8( - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - - const __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - const __m128i vidx = _mm_slli_epi32(vseq, log2_width); - - // Brute force transpose, works with all block sizes - uvg_pixel tmp[PRED_BUF_SIZE]; - memcpy(tmp, dst, (sizeof(uvg_pixel) * (width * height))); - - if (width == height) { - for (int_fast32_t y = 0; y < height - 1; ++y) { - for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * height + x], dst[x * width + y], uvg_pixel); - } - } - } - else { - for (int y = 0; y < width; ++y) { - for (int x = 0; x < height; ++x) { - dst[x + y * height] = tmp[y + x * width]; - } - } - } - } } From 28ca1b0bcab6c59f0fe44485f9fdc7afe9e0528c Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 6 Oct 2023 16:49:36 +0300 Subject: [PATCH 026/237] Add delta_int and delta_fract tables. Placeholder for 16 width intra avx2. --- src/strategies/avx2/intra-avx2.c | 143 +++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a4acfee5..af73d46f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -154,6 +154,143 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever }; +// Delta int and delta fract tables. Rows are prediction mode, columns y offset. +ALIGNED(32) static const int64_t delta_int_table[2112] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 2 Diagonal mode + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, // 6 + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, // 10 + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, // 14 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, // 22 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, // 26 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, // 30 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, // 34 Diagonal mode + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, // 38 + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, // 42 + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, 
-4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, // 46 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, // 54 + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, // 58 + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, // 62 + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 66 Diagonal mode +}; + +ALIGNED(32) static const int32_t delta_fract_table[2212] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 16, 0, 16, 
0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 + 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, + 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 
24, 1, 10, 19, 28, 5, 14, 23, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 38 + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 42 + 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, + 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 46 + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 54 + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 58 + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 62 + 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 Diagonal mode +}; + /** * \brief Generate angular 
predictions. * \param cu_loc CU locationand size data. @@ -713,6 +850,12 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } +static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int64_t* delta_int, const int32_t* delta_fract, const int height, const int use_cubic) +{ + const int width = 16; +} + + // TODO: vectorize static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int64_t* delta_int, const int32_t* delta_fract) { From e579f2aaae1a1d57c34c70815d7b55bdd64d8e68 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 25 Oct 2023 11:30:13 +0300 Subject: [PATCH 027/237] Use premade delta tables instead of computing. --- src/strategies/avx2/intra-avx2.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index af73d46f..ea8810b8 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -719,6 +719,7 @@ static void uvg_angular_pred_avx2_old( // TODO: vectorize +// TODO: obsolete, remove all usage static void angular_pred_avx2_delta_tables(int64_t* delta_int, int32_t* delta_fract, const int line, const int mrl, const int sample_disp) { for (int i = 0, delta_pos = sample_disp * (1 + mrl); i < line; ++i, delta_pos += sample_disp) { @@ -1054,11 +1055,14 @@ static void uvg_angular_pred_avx2( if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. - int64_t delta_int[MAX_PRED_WIDTH]; - int32_t delta_fract[MAX_PRED_WIDTH]; + + // Set delta table pointers + int mode_offset = (pred_mode - 2) * 32; + const int64_t* delta_int = &delta_int_table[mode_offset]; + const int32_t* delta_fract = &delta_fract_table[mode_offset]; // TODO: for horizontal modes, these should be constructed using width instead of height - angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); + //angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); // Check if the angle is fractional. If yes, interpolation is needed if ((abs(sample_disp) & 0x1F) != 0) { @@ -1069,7 +1073,7 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_w4_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 8: break; - case 16: break; + case 16: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 32: break; case 64: break; default: From f3e18c8da613b148adb57c1fe9a7a76c7c5ecead Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 25 Oct 2023 13:15:08 +0300 Subject: [PATCH 028/237] WIP Implement 16 width intra avx2. --- src/strategies/avx2/intra-avx2.c | 115 ++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index ea8810b8..2931c698 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -154,8 +154,9 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever }; +// TODO: extend delta tables to hold 64 y offsets when 64x64 prediction is supported. // Delta int and delta fract tables. Rows are prediction mode, columns y offset. 
-ALIGNED(32) static const int64_t delta_int_table[2112] = { +ALIGNED(32) static const int16_t delta_int_table[2112] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 2 Diagonal mode 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, @@ -223,7 +224,7 @@ ALIGNED(32) static const int64_t delta_int_table[2112] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 66 Diagonal mode }; -ALIGNED(32) static const int32_t delta_fract_table[2212] = { +ALIGNED(32) static const int16_t delta_fract_table[2212] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, @@ -766,7 +767,7 @@ static const int16_t cubic_filter[32][4] = }; -static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int64_t* delta_int, const int32_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 4; @@ -801,8 +802,7 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, int16_t f[4][4] = { { 0 } }; // For a 4 width block, height must be at least 4. Handle 4 lines at once - for (int y = 0; y < height; y += 4) - { + for (int y = 0; y < height; y += 4) { if (use_cubic) { memcpy(f[0], cubic_filter[delta_fract[y + 0]], 8); memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); @@ -821,7 +821,12 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, // Do 4-tap intra interpolation filtering uvg_pixel* p = (uvg_pixel*)ref_main; - __m256i vidx = _mm256_loadu_si256((__m256i*)&delta_int[y]); + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. 
+ __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], + delta_int[y + 1], + delta_int[y + 2], + delta_int[y + 3]); __m256i all_weights = _mm256_loadu_si256((__m256i*)f); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); @@ -851,14 +856,102 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } -static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int64_t* delta_int, const int32_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 16; + + const __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + const __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + ); + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + ); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + ); + + //int16_t f[4][4] = { { 0 } }; + + // For a 16 width block, height can be 1. + for (int y = 0; y < height; ++y) { + __m256i all_weights; + if (use_cubic) { + //memcpy(f[0], cubic_filter[delta_fract[y + 0]], 8); + //memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); + //memcpy(f[2], cubic_filter[delta_fract[y + 2]], 8); + //memcpy(f[3], cubic_filter[delta_fract[y + 3]], 8); + int64_t *tmp = (int64_t*)&delta_fract[y]; + all_weights = _mm256_set1_epi64x(*tmp); + } + else { + for (int yy = 0; yy < 4; ++yy) { + const int16_t offset = (delta_fract[y + yy] >> 1); + int16_t tmp[4]; + tmp[0] = 16 - offset; + tmp[1] = 32 - offset; + tmp[2] = 16 + offset; + tmp[3] = offset; + all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); + } + } + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)ref_main; + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. 
+ __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], + delta_int[y + 1], + delta_int[y + 2], + delta_int[y + 3]); + //__m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + } + } } // TODO: vectorize -static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int64_t* delta_int, const int32_t* delta_fract) +static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { @@ -871,7 +964,7 @@ static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, cons // TODO: vectorize -static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int64_t* delta_int) +static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { for (int y = 0; y < height; ++y) { uvg_pixel* dst_row = dst + y * width; @@ -1058,8 +1151,8 @@ static void uvg_angular_pred_avx2( // Set delta table pointers int mode_offset = (pred_mode - 2) * 32; - const int64_t* delta_int = &delta_int_table[mode_offset]; - const int32_t* delta_fract = &delta_fract_table[mode_offset]; + const int16_t* delta_int = &delta_int_table[mode_offset]; + const int16_t* delta_fract = &delta_fract_table[mode_offset]; // TODO: for horizontal modes, these should be constructed using width instead of height //angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); From 62382850c115456f9bf75fe2139fc0cd7d77ec76 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 6 Nov 2023 15:43:08 +0200 Subject: [PATCH 029/237] Add PDPC filter for horizontal modes. 
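Note on the delta-table conversion in the preceding patch: the tables are now stored as int16_t (the integer displacements stay within roughly +-64 and the fractional parts within 0..31), but _mm256_i64gather_epi64 still takes 64-bit indices, so each kernel widens four table entries per iteration with _mm256_setr_epi64x. A minimal, self-contained sketch of that pattern is given below; the helper name is illustrative and not part of the patch.

#include <stdint.h>
#include <immintrin.h>

/* Illustrative helper (not in the patch): widen four consecutive 16-bit
 * displacement entries into the 64-bit index vector expected by
 * _mm256_i64gather_epi64. With a byte base pointer and scale 1, each
 * gathered 64-bit lane then holds the 8 reference samples starting at
 * ref + delta_int[y + i]. */
static inline __m256i widen_delta_int16(const int16_t *delta_int, int y)
{
  return _mm256_setr_epi64x(delta_int[y + 0],
                            delta_int[y + 1],
                            delta_int[y + 2],
                            delta_int[y + 3]);
}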
--- src/strategies/avx2/intra-avx2.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 2931c698..1298cb5c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -969,14 +969,14 @@ static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_ for (int y = 0; y < height; ++y) { uvg_pixel* dst_row = dst + y * width; uvg_pixel* ref_row = ref + delta_int[y] + 1; - for (int_fast32_t x = 0; x + 3 < width; x += 4) { + for (int_fast32_t x = 0; x < width; x += 4) { memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); } } } -static void angular_pdpc_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int_fast8_t mode_disp, const int16_t inv_sample_disp) +static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { int16_t wL[4]; int16_t left[4][4]; @@ -1031,6 +1031,22 @@ static void angular_pdpc_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const i } } + +static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + // TODO: PDPC for horizontal modes + + for (int y = 0; y < height; ++y) { + int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; + int16_t wT = 32 >> ((y << 1) >> scale); + for (int x = 0; x < width; ++x) { + int16_t top = ref_side[x + (inv_angle_sum >> 9) + 1]; + dst[y * width + x] = CLIP_TO_PIXEL((top * wT + (64 - wT) * dst[y * width + x] + 32) >> 6); + } + } +} + + static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, @@ -1068,9 +1084,9 @@ static void uvg_angular_pred_avx2( // Whether to swap references to always project on the left reference row. const bool vertical_mode = intra_mode >= 34; - // Modes distance to horizontal or vertical mode. + // Modes distance to horizontal or vertical mode. Possible values: [-16, 16] + // For pure vertical or horizontal modes, this is 0. For pure diagonal modes, this is either -16 or 16. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18); - //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; @@ -1232,8 +1248,11 @@ static void uvg_angular_pred_avx2( } } if (PDPC_filter) { - angular_pdpc_avx2(dst, ref_side, width, height, scale, mode_disp, modedisp2invsampledisp[abs(mode_disp)]); - } + if (vertical_mode) + angular_pdpc_ver_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); + else + angular_pdpc_hor_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); + } } } From 153a6dd3c206e7040a13118723313d52cb480df8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 7 Nov 2023 14:30:02 +0200 Subject: [PATCH 030/237] Fix horizontal mode pdpc. The dimension which caused errors had a limit value in generic solution, no idea where it comes from. It is not defined in the jvet document. 
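For reference, the horizontal-mode PDPC blend added in the previous patch, together with the row limit this fix introduces, reduces to the scalar model below. This is a sketch rather than the project's code: the function name is made up, uvg_pixel is replaced by uint8_t (8-bit depth assumed, as in this file), and the MIN and CLIP_TO_PIXEL macros are written out inline.

#include <stdint.h>

/* Scalar reference for horizontal-mode PDPC (illustrative names, 8-bit
 * samples assumed). Only the first MIN(3 << scale, height) rows are
 * filtered, and the top-reference weight wT decays as y grows. */
static void pdpc_hor_scalar_ref(uint8_t *dst, const uint8_t *ref_top,
                                int width, int height, int scale,
                                int16_t inv_sample_disp)
{
  const int limit_rows = 3 << scale;
  const int limit = height < limit_rows ? height : limit_rows;
  for (int y = 0; y < limit; ++y) {
    const int inv_angle_sum = 256 + (y + 1) * inv_sample_disp;
    const int16_t wT = 32 >> ((y << 1) >> scale);
    for (int x = 0; x < width; ++x) {
      const int top = ref_top[x + (inv_angle_sum >> 9) + 1];
      const int val = (top * wT + (64 - wT) * dst[y * width + x] + 32) >> 6;
      dst[y * width + x] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }
  }
}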
--- src/strategies/avx2/intra-avx2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1298cb5c..7da1da50 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1035,8 +1035,9 @@ static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { // TODO: PDPC for horizontal modes - - for (int y = 0; y < height; ++y) { + int limit = MIN(3 << scale, height); + + for (int y = 0; y < limit; ++y) { int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; int16_t wT = 32 >> ((y << 1) >> scale); for (int x = 0; x < width; ++x) { From 4f13c7f9a039c1aba352ab69ce173757c771c794 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 7 Nov 2023 15:19:55 +0200 Subject: [PATCH 031/237] Implement 16 width intra angular avx2. --- src/strategies/avx2/intra-avx2.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7da1da50..13ce4428 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -898,19 +898,20 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, //memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); //memcpy(f[2], cubic_filter[delta_fract[y + 2]], 8); //memcpy(f[3], cubic_filter[delta_fract[y + 3]], 8); - int64_t *tmp = (int64_t*)&delta_fract[y]; - all_weights = _mm256_set1_epi64x(*tmp); + //int64_t *tmp = (int64_t*)&delta_fract[y]; + int16_t tmp[4]; + memcpy(&tmp, cubic_filter[delta_fract[y]], 8); + all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); } else { - for (int yy = 0; yy < 4; ++yy) { - const int16_t offset = (delta_fract[y + yy] >> 1); - int16_t tmp[4]; - tmp[0] = 16 - offset; - tmp[1] = 32 - offset; - tmp[2] = 16 + offset; - tmp[3] = offset; - all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); - } + const int16_t offset = (delta_fract[y] >> 1); + int16_t tmp[4]; + tmp[0] = 16 - offset; + tmp[1] = 32 - offset; + tmp[2] = 16 + offset; + tmp[3] = offset; + all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); + } // Do 4-tap intra interpolation filtering @@ -1034,7 +1035,7 @@ static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { - // TODO: PDPC for horizontal modes + // TODO: PDPC for horizontal modes. Change this to AVX2 int limit = MIN(3 << scale, height); for (int y = 0; y < limit; ++y) { From 7ddde89555b35bc30d5c971e99bed41cb7d7cef3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 8 Nov 2023 15:55:42 +0200 Subject: [PATCH 032/237] Implement width 8 angular intra avx2. 
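Both the width-16 kernel fixed above and the width-8 kernel added below construct their 4-tap weights per output row in the same way: either a lookup into the 32x4 cubic filter table, or linear taps derived from delta_fract, packed as 16-bit values and broadcast across the vector (the width-8 variant additionally interleaves the taps of two rows). A standalone sketch of the per-row construction follows; the helper name is illustrative and the filter table is passed in as a parameter rather than referenced directly.

#include <stdint.h>
#include <string.h>
#include <immintrin.h>

/* Illustrative helper: build the four 16-bit filter taps for output row y
 * and broadcast them to every 64-bit lane. With cubic filtering the taps
 * come straight from the 32x4 filter table indexed by the fractional
 * offset; otherwise linear taps are derived from half the offset o as
 * {16 - o, 32 - o, 16 + o, o}. */
static inline __m256i broadcast_row_weights(const int16_t *delta_fract, int y,
                                            int use_cubic,
                                            const int16_t filter_table[32][4])
{
  int16_t w[4];
  if (use_cubic) {
    memcpy(w, filter_table[delta_fract[y]], sizeof(w));
  } else {
    const int16_t o = delta_fract[y] >> 1;
    w[0] = 16 - o;
    w[1] = 32 - o;
    w[2] = 16 + o;
    w[3] = o;
  }
  int64_t packed;
  memcpy(&packed, w, sizeof(packed)); /* pack 4 x int16 into one 64-bit lane */
  return _mm256_set1_epi64x(packed);
}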
--- src/strategies/avx2/intra-avx2.c | 91 +++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 13ce4428..b2b78045 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -855,6 +855,94 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } } +static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +{ + const int width = 8; + + const __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + const __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + ); + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + ); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + ); + + // For a 8 width block, height must be at least 2. Handle 2 lines at once + for (int y = 0; y < height; y += 2) { + __m256i all_weights; + if (use_cubic) { + int16_t tmp[8]; + memcpy(&tmp[0], cubic_filter[delta_fract[y + 0]], 8); + memcpy(&tmp[4], cubic_filter[delta_fract[y + 1]], 8); + all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); + } + else { + int16_t tmp[8]; + for (int yy = 0; yy < 2; ++yy) { + const int16_t offset = (delta_fract[y + yy] >> 1); + const int idx = yy * 4; + tmp[idx + 0] = 16 - offset; + tmp[idx + 1] = 32 - offset; + tmp[idx + 2] = 16 + offset; + tmp[idx + 3] = offset; + } + all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); + } + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)ref_main; + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. 
+ __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], + delta_int[y + 1], + delta_int[y + 2], + delta_int[y + 3]); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + } + } +} static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { @@ -911,7 +999,6 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, tmp[2] = 16 + offset; tmp[3] = offset; all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); - } // Do 4-tap intra interpolation filtering @@ -1183,7 +1270,7 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_avx2_w4_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 8: break; + case 8: angular_pred_avx2_w8_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 16: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 32: break; case 64: break; From 9eda50d613fa3bf007ecf95cb3c0e638b7b79b3b Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 8 Nov 2023 17:22:35 +0200 Subject: [PATCH 033/237] Fix width 8 angular intra avx2. 16 width is still broken, same issue as width 8. --- src/strategies/avx2/intra-avx2.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b2b78045..3183a346 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -915,12 +915,12 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, // Cast from 16-bit to 64-bit. 
__m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], delta_int[y + 1], - delta_int[y + 2], - delta_int[y + 3]); + delta_int[y + 0] + 4, + delta_int[y + 1] + 4); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + for (int_fast32_t x = 0; x < width; x += 8, p += 8) { __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); @@ -936,18 +936,16 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + *(uint32_t*)(dst + (y + 0) * width + (x + 0)) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + (x + 0)) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 0) * width + (x + 4)) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 1) * width + (x + 4)) = _mm_extract_epi32(filtered, 3); } } } -static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) { - const int width = 16; - const __m256i p_shuf_01 = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, @@ -1013,7 +1011,7 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + for (int_fast32_t x = 0; x < width; x += 4, p += 4) { __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); @@ -1030,9 +1028,6 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i filtered = _mm_packus_epi16(lo, hi); *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); } } } @@ -1271,9 +1266,9 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_w4_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 8: angular_pred_avx2_w8_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 16: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 32: break; - case 64: break; + case 16: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 32: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 64: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; default: 
assert(false && "Intra angular predicion: illegal width.\n"); break; From b935093c00cfaf0486036c167e9f730042d0a1bb Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 8 Nov 2023 17:30:56 +0200 Subject: [PATCH 034/237] Fix width 16 angular intra avx2. Width 32 and 64 also work with this function. --- src/strategies/avx2/intra-avx2.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3183a346..daa7d289 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1003,15 +1003,15 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, uvg_pixel* p = (uvg_pixel*)ref_main; // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. - __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], - delta_int[y + 1], - delta_int[y + 2], - delta_int[y + 3]); + __m256i vidx = _mm256_setr_epi64x(delta_int[y] + 0, + delta_int[y] + 4, + delta_int[y] + 8, + delta_int[y] + 12); //__m256i all_weights = _mm256_loadu_si256((__m256i*)f); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - for (int_fast32_t x = 0; x < width; x += 4, p += 4) { + for (int_fast32_t x = 0; x < width; x += 16, p += 16) { __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); @@ -1027,7 +1027,10 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 0) * width + x + 0) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 0) * width + x + 4) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 0) * width + x + 8) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 0) * width + x + 12) = _mm_extract_epi32(filtered, 3); } } } From f99c6a630c5f85e1106d75fce47547154fa85e8a Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 9 Nov 2023 12:52:53 +0200 Subject: [PATCH 035/237] Extend delta tables for 64 height predictions. --- src/strategies/avx2/intra-avx2.c | 268 +++++++++++++++---------------- 1 file changed, 134 insertions(+), 134 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index daa7d289..0b307f3e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -156,140 +156,140 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { // TODO: extend delta tables to hold 64 y offsets when 64x64 prediction is supported. // Delta int and delta fract tables. Rows are prediction mode, columns y offset. 
-ALIGNED(32) static const int16_t delta_int_table[2112] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 2 Diagonal mode - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, // 6 - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, // 10 - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, // 14 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, // 22 - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, // 26 - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, 
-18, -19, -19, -20, -20, // 30 - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, // 34 Diagonal mode - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, // 38 - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, // 42 - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, // 46 - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, // 54 - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 
7, 8, 8, 8, 9, 9, 9, 10, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, // 58 - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, // 62 - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // 66 Diagonal mode +ALIGNED(32) static const int16_t delta_int_table[4160] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 
19, 19, 20, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, 
-21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // 26 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, 
-12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 38 + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // 42 + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 46 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 54 + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 58 + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 62 + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 66 }; -ALIGNED(32) static const int16_t delta_fract_table[2212] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 - 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, - 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 
24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 38 - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 42 - 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, - 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 46 - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 54 - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 58 - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 16, 0, 
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 62 - 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 Diagonal mode +ALIGNED(32) static const int16_t delta_fract_table[4160] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 + 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 
14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, + 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 
0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 38 + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 42 + 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 
10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, + 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 46 + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 54 + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 
4, 16, 28, 8, 20, 0, // 58 + 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, + 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 62 + 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, + 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, + 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 }; /** @@ -1118,9 +1118,9 @@ static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con } +// TODO: vectorize static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { - // TODO: PDPC for horizontal modes. Change this to AVX2 int limit = MIN(3 << scale, height); for (int y = 0; y < limit; ++y) { @@ -1253,7 +1253,7 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. // Set delta table pointers - int mode_offset = (pred_mode - 2) * 32; + int mode_offset = (pred_mode - 2) * 64; const int16_t* delta_int = &delta_int_table[mode_offset]; const int16_t* delta_fract = &delta_fract_table[mode_offset]; From 4210788943e85b3f491c6fae086d36813bfb3615 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 9 Nov 2023 14:13:55 +0200 Subject: [PATCH 036/237] Add comments. --- src/strategies/avx2/intra-avx2.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0b307f3e..aab93958 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -155,9 +155,9 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { }; // TODO: extend delta tables to hold 64 y offsets when 64x64 prediction is supported. -// Delta int and delta fract tables. Rows are prediction mode, columns y offset. +// Delta int and delta fract tables. Rows are prediction mode, columns y offset. 
(or x offset for horizontal modes) ALIGNED(32) static const int16_t delta_int_table[4160] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, @@ -173,7 +173,7 @@ ALIGNED(32) static const int16_t delta_int_table[4160] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, @@ -189,7 +189,7 @@ ALIGNED(32) static const int16_t delta_int_table[4160] = { -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, 
-44, -44, -45, -46, -46, -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, @@ -205,7 +205,7 @@ ALIGNED(32) static const int16_t delta_int_table[4160] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, @@ -221,11 +221,11 @@ ALIGNED(32) static const int16_t delta_int_table[4160] = { 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 66 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 66 Diagonal mode }; ALIGNED(32) static const int16_t delta_fract_table[4160] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, @@ -241,7 +241,7 @@ ALIGNED(32) static const int16_t delta_fract_table[4160] = { 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 
16, 18, 20, 22, 24, 26, 28, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, @@ -257,7 +257,7 @@ ALIGNED(32) static const int16_t delta_fract_table[4160] = { 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, @@ -273,7 +273,7 @@ ALIGNED(32) static const int16_t delta_fract_table[4160] = { 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 
25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, @@ -289,7 +289,7 @@ ALIGNED(32) static const int16_t delta_fract_table[4160] = { 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 Diagonal mode }; /** From 1a2050681671989114e91a4e575ff3f979e386b7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 10 Nov 2023 14:41:27 +0200 Subject: [PATCH 037/237] Replace gather with load in 16 width intra angular avx2. Tweak the shuffle vectors. 
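For reference, why a plain load is enough here: with a vertical mode every pixel on a
row shares the same delta_int[y], so the sixteen 4-tap windows of one 16-wide row all
come from one contiguous 19-byte span of the reference. A single unaligned 256-bit
load at p + delta_int[y] (plus the lane permute that keeps the needed bytes reachable
for the in-lane shuffles) therefore covers what the 64-bit gather used to fetch.
A scalar model of one 16-wide row, assuming f[] holds that row's 4-tap weights
(cubic_filter[delta_fract[y]] or the linear weights); sketch only, not the committed code:

    for (int x = 0; x < 16; ++x) {
      const uvg_pixel *r = p + delta_int[y] + x;  // contiguous across the whole row
      int sum = f[0]*r[0] + f[1]*r[1] + f[2]*r[2] + f[3]*r[3];
      sum = (sum + 32) >> 6;                      // same rounding as the vector path
      dst[y * 16 + x] = (uvg_pixel)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }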
--- src/strategies/avx2/intra-avx2.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index aab93958..bb61e44f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -948,16 +948,16 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, { const __m256i p_shuf_01 = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); const __m256i p_shuf_23 = _mm256_setr_epi8( 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a ); const __m256i w_shuf_01 = _mm256_setr_epi8( @@ -1007,15 +1007,19 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, delta_int[y] + 4, delta_int[y] + 8, delta_int[y] + 12); + //__m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); for (int_fast32_t x = 0; x < width; x += 16, p += 16) { + __m256i vp = _mm256_loadu_si256((__m256i*)(p + delta_int[y])); - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + __m256i tmp = _mm256_permute4x64_epi64(vp, _MM_SHUFFLE(2, 1, 1, 0)); + + __m256i vp_01 = _mm256_shuffle_epi8(tmp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(tmp, p_shuf_23); __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); @@ -1027,10 +1031,8 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint32_t*)(dst + (y + 0) * width + x + 0) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 0) * width + x + 4) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 0) * width + x + 8) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 0) * width + x + 12) = _mm_extract_epi32(filtered, 3); + _mm_store_si128((__m128i*)dst, filtered); + dst += 16; } } } From d6f86bdc8ecfba0c17c72d0c792e413a6b423e94 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 14 Nov 2023 14:59:08 +0200 Subject: [PATCH 038/237] Implement width 4 intra angular avx2 for horizontal modes. 
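Unlike the vertical kernels, the reference offset and the 4-tap weights now depend on
the column instead of the row (delta_int[x] / delta_fract[x]), which is why the weights
are gathered per column and the output needs the extra r_shuffle to come out in
row-major order. Roughly, the scalar computation this kernel vectorizes four rows at a
time (sketch only; clamping written out instead of the packus):

    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < 4; ++x) {
        const uvg_pixel *r = ref_main + y + delta_int[x];
        const int16_t  *w = f[x];  // cubic_filter[delta_fract[x]] or the linear weights
        int sum = w[0]*r[0] + w[1]*r[1] + w[2]*r[2] + w[3]*r[3];
        sum = (sum + 32) >> 6;
        dst[y * 4 + x] = (uvg_pixel)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
      }
    }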
--- src/strategies/avx2/intra-avx2.c | 115 ++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index bb61e44f..d6deb01c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1038,6 +1038,119 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } +static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +{ + const int width = 4; + + const __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c + ); + + const __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e + ); + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a + ); + + //const __m256i w_shuf_01 = _mm256_setr_epi8( + // 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + // 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + // 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + // 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + //); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e + ); + + /*const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, + 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + );*/ + + const __m128i r_shuffle = _mm_setr_epi8( + 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, + 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f + ); + + int16_t f[4][4] = { { 0 } }; + + // For a 4 width block, height must be at least 4. Handle 4 lines at once + for (int y = 0; y < height; y += 4) { + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)(ref_main + y); + + for (int_fast32_t x = 0; x < width; x += 4, p += 4) { + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[x + 0]], 8); + memcpy(f[1], cubic_filter[delta_fract[x + 1]], 8); + memcpy(f[2], cubic_filter[delta_fract[x + 2]], 8); + memcpy(f[3], cubic_filter[delta_fract[x + 3]], 8); + } + else { + for (int xx = 0; xx < 4; ++xx) { + const int16_t offset = (delta_fract[x + xx] >> 1); + f[xx][0] = 16 - offset; + f[xx][1] = 32 - offset; + f[xx][2] = 16 + offset; + f[xx][3] = offset; + } + } + + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. 
+ __m256i vidx = _mm256_setr_epi64x(delta_int[x + 0], + delta_int[x + 1], + delta_int[x + 2], + delta_int[x + 3]); + __m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); + //__m256i vp = _mm256_loadu_si256((__m256i*)(p + delta_int[y])); + + //__m256i tmp = _mm256_permute4x64_epi64(vp, _MM_SHUFFLE(2, 1, 1, 0)); + + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + + + _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); + dst += 16; + } + } +} + + // TODO: vectorize static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { @@ -1281,7 +1394,7 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: break; + case 4: angular_pred_avx2_w4_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 8: break; case 16: break; case 32: break; From 3955b1dc0a1d7801c4b4cedf699faf66196d498b Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 15 Nov 2023 12:17:43 +0200 Subject: [PATCH 039/237] WIP width 8 angular intra av2 horizontal. --- src/strategies/avx2/intra-avx2.c | 118 ++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d6deb01c..149c5fb5 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1063,13 +1063,6 @@ static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a ); - //const __m256i w_shuf_01 = _mm256_setr_epi8( - // 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - // 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - // 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - // 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a - //); - const __m256i w_shuf_23 = _mm256_setr_epi8( 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, @@ -1077,13 +1070,6 @@ static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e ); - /*const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e - );*/ - const __m128i r_shuffle = _mm_setr_epi8( 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f @@ -1097,7 +1083,7 @@ static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, // Do 4-tap intra interpolation filtering uvg_pixel* p = (uvg_pixel*)(ref_main + y); - for (int_fast32_t x = 0; x < width; x += 4, p += 4) { + for (int_fast32_t x = 0; x < width; x += 4) { if (use_cubic) { memcpy(f[0], cubic_filter[delta_fract[x + 0]], 8); memcpy(f[1], 
cubic_filter[delta_fract[x + 1]], 8); @@ -1142,14 +1128,110 @@ static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); dst += 16; } } } +static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +{ + const int width = 8; + + const __m256i p_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c + ); + + const __m256i p_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e, + 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, + 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e + ); + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, + 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a + ); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, + 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e + ); + + const __m128i r_shuffle = _mm_setr_epi8( + 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, + 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f + ); + + int16_t f[8][4] = { { 0 } }; + + // For a 8 width block, height must be at least 2. Handle 2 lines at once + for (int y = 0; y < height; y += 2) { + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)(ref_main + y); + + for (int_fast32_t x = 0; x < width; x += 8) { + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[x + 0]], 8); + memcpy(f[1], cubic_filter[delta_fract[x + 1]], 8); + memcpy(f[2], cubic_filter[delta_fract[x + 2]], 8); + memcpy(f[3], cubic_filter[delta_fract[x + 3]], 8); + memcpy(f[4], cubic_filter[delta_fract[x + 4]], 8); + memcpy(f[5], cubic_filter[delta_fract[x + 5]], 8); + memcpy(f[6], cubic_filter[delta_fract[x + 6]], 8); + memcpy(f[7], cubic_filter[delta_fract[x + 7]], 8); + } + else { + for (int xx = 0; xx < 8; ++xx) { + const int16_t offset = (delta_fract[x + xx] >> 1); + f[xx][0] = 16 - offset; + f[xx][1] = 32 - offset; + f[xx][2] = 16 + offset; + f[xx][3] = offset; + } + } + + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. 
+ __m256i vidx = _mm256_setr_epi32(delta_int[x + 0], delta_int[x + 1], + delta_int[x + 2], delta_int[x + 3], + delta_int[x + 4], delta_int[x + 5], + delta_int[x + 6], delta_int[x + 7]); + __m256i all_weights = _mm256_loadu_si256((__m256i*)f); + __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + + __m256i vp = _mm256_i32gather_epi32((const int*)p, vidx, 1); + + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); + + dst += 16; + } + } +} + // TODO: vectorize static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) @@ -1395,7 +1477,7 @@ static void uvg_angular_pred_avx2( else { switch (width) { case 4: angular_pred_avx2_w4_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 8: break; + case 8: angular_pred_avx2_w8_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 16: break; case 32: break; case 64: break; From 5190047d5697ff0d77406476d89b08cdedf5f531 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 15 Nov 2023 17:15:16 +0200 Subject: [PATCH 040/237] Add wide angle correction. Update asserts to include wide angle modes. --- src/strategies/avx2/intra-avx2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 149c5fb5..c23b2ed1 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -914,7 +914,7 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], - delta_int[y + 1], + delta_int[y + 1], // TODO: flip these middle ones, then replace gather with 128-bit load. Replace extract with store. Also, fix shuffle vectors. delta_int[y + 0] + 4, delta_int[y + 1] + 4); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); @@ -1350,7 +1350,8 @@ static void uvg_angular_pred_avx2( const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 6) && (log2_height >= 0 && log2_height <= 6)); - assert(intra_mode >= 2 && intra_mode <= 66); + // Modes [-1, -14] and [67, 80] are wide angle modes + assert(intra_mode >= -14 && intra_mode <= 80); uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; uint8_t isp = isp_mode; From 0470c180f153d15160b82c1891cfe295a7590db2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 16 Nov 2023 16:40:53 +0200 Subject: [PATCH 041/237] Implement wide angle modes. Change the way delta int and delta fract tables are indexed. Remove symmetrical duplicate lines from tables. Extend tables to support wide angle modes. Add negative delta int table. 
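For context, every row of the delta tables follows the usual angular-prediction
relation between the mode's sample displacement and the per-offset integer and
fractional parts, so rows for the extended wide-angle range can be generated the same
way. Generation sketch only, not part of the committed code; 'angle' stands for the
mode's displacement in 1/32-sample units, e.g. 32 at the diagonal modes:

    for (int j = 0; j < 64; ++j) {
      const int delta = (j + 1) * angle;  // displacement of offset j, in 1/32 samples
      delta_int[j]   = delta >> 5;        // integer sample offset
      delta_fract[j] = delta & 31;        // fractional part, selects the filter phase
    }

Modes between the pure horizontal and pure vertical modes have negative displacements
and hence negative integer offsets, which is presumably what the separate negative
delta int table covers.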
--- src/strategies/avx2/intra-avx2.c | 239 ++++++++++++++----------------- 1 file changed, 104 insertions(+), 135 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c23b2ed1..425ee8f8 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -156,140 +156,109 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { // TODO: extend delta tables to hold 64 y offsets when 64x64 prediction is supported. // Delta int and delta fract tables. Rows are prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_table[4160] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 
1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // 26 - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, 
-10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 38 - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, 
-24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // 42 - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 46 - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 54 - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 58 - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 62 - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 66 Diagonal mode +ALIGNED(32) static const int16_t delta_int_table[2048] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // +4 + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // +8 + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // +12 + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
58, 59, 60, 61, 62, 63, 64, // Diagonal mode. Wide angle modes below + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, + 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, + 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // +20 + 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, + 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, + 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // +24 + 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, + 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, + 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // +28 + 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 
404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, + 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, }; -ALIGNED(32) static const int16_t delta_fract_table[4160] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 - 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 
26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal mode - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, - 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 38 - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 42 - 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 
16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, - 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 46 - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50 Pure vertical mode - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 54 - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 
20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 58 - 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, - 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 62 - 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, - 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, - 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 66 Diagonal mode +ALIGNED(32) static const int16_t delta_int_neg_table[2048] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // +4 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, + -1, -1, 
-1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // +8 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // +12 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // Diagonal mode. 
Wide angle modes below + -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -60, -61, -62, -63, -64, -65, -66, -67, -68, -69, -70, + -2, -3, -4, -5, -7, -8, -9, -10, -11, -13, -14, -15, -16, -18, -19, -20, -21, -22, -24, -25, -26, -27, -29, -30, -31, -32, -33, -35, -36, -37, -38, -39, -41, -42, -43, -44, -46, -47, -48, -49, -50, -52, -53, -54, -55, -57, -58, -59, -60, -61, -63, -64, -65, -66, -68, -69, -70, -71, -72, -74, -75, -76, -77, -78, + -2, -3, -5, -6, -8, -9, -10, -12, -13, -15, -16, -17, -19, -20, -22, -23, -24, -26, -27, -29, -30, -31, -33, -34, -36, -37, -38, -40, -41, -43, -44, -45, -47, -48, -50, -51, -53, -54, -55, -57, -58, -60, -61, -62, -64, -65, -67, -68, -69, -71, -72, -74, -75, -76, -78, -79, -81, -82, -83, -85, -86, -88, -89, -90, + -2, -4, -5, -7, -8, -10, -12, -13, -15, -16, -18, -20, -21, -23, -24, -26, -28, -29, -31, -32, -34, -36, -37, -39, -40, -42, -44, -45, -47, -48, -50, -51, -53, -55, -56, -58, -59, -61, -63, -64, -66, -67, -69, -71, -72, -74, -75, -77, -79, -80, -82, -83, -85, -87, -88, -90, -91, -93, -95, -96, -98, -99, -101, -102, // +20 + -2, -4, -6, -8, -9, -11, -13, -15, -17, -18, -20, -22, -24, -25, -27, -29, -31, -33, -34, -36, -38, -40, -41, -43, -45, -47, -49, -50, -52, -54, -56, -57, -59, -61, -63, -65, -66, -68, -70, -72, -74, -75, -77, -79, -81, -82, -84, -86, -88, -90, -91, -93, -95, -97, -98, -100, -102, -104, -106, -107, -109, -111, -113, -114, + -2, -4, -6, -8, -10, -12, -14, -16, -18, -20, -22, -24, -26, -28, -30, -32, -34, -36, -38, -40, -42, -44, -46, -48, -50, -52, -54, -56, -58, -60, -62, -64, -66, -68, -70, -72, -74, -76, -78, -80, -82, -84, -86, -88, -90, -92, -94, -96, -98, -100, -102, -104, -106, -108, -110, -112, -114, -116, -118, -120, -122, -124, -126, -128, + -3, -5, -7, -10, -12, -14, -16, -19, -21, -23, -26, -28, -30, -32, -35, -37, -39, -42, -44, -46, -48, -51, -53, -55, -58, -60, -62, -64, -67, -69, -71, -73, -76, -78, -80, -83, -85, -87, -89, -92, -94, -96, -99, -101, -103, -105, -108, -110, -112, -115, -117, -119, -121, -124, -126, -128, -131, -133, -135, -137, -140, -142, -144, -146, + -3, -6, -9, -11, -14, -17, -19, -22, -25, -27, -30, -33, -35, -38, -41, -43, -46, -49, -52, -54, -57, -60, -62, -65, -68, -70, -73, -76, -78, -81, -84, -86, -89, -92, -95, -97, -100, -103, -105, -108, -111, -113, -116, -119, -121, -124, -127, -129, -132, -135, -138, -140, -143, -146, -148, -151, -154, -156, -159, -162, -164, -167, -170, -172, // +24 + -4, -7, -10, -13, -16, -20, -23, -26, -29, -32, -36, -39, -42, -45, -48, -51, -55, -58, -61, -64, -67, -71, -74, -77, -80, -83, -87, -90, -93, -96, -99, -102, -106, -109, -112, -115, -118, -122, -125, -128, -131, -134, -138, -141, -144, -147, -150, -153, -157, -160, -163, -166, -169, -173, -176, -179, -182, -185, -189, -192, -195, -198, -201, -204, + -4, -8, -12, -16, -20, -24, -28, -32, -36, -40, -44, -48, -52, -56, -60, -64, -68, -72, -76, -80, -84, -88, -92, -96, -100, -104, -108, -112, -116, -120, -124, -128, -132, -136, -140, -144, -148, -152, -156, -160, -164, -168, -172, -176, -180, -184, -188, -192, -196, -200, -204, -208, -212, -216, -220, -224, -228, -232, -236, -240, -244, -248, -252, -256, + -6, -11, -17, -22, -27, -33, -38, -43, -49, -54, -59, -65, -70, -75, -81, -86, -91, -97, -102, -107, -113, -118, -123, -129, -134, -139, -145, -150, -155, -161, -166, -171, -177, -182, -188, 
-193, -198, -204, -209, -214, -220, -225, -230, -236, -241, -246, -252, -257, -262, -268, -273, -278, -284, -289, -294, -300, -305, -310, -316, -321, -326, -332, -337, -342, + -8, -16, -24, -32, -40, -48, -56, -64, -72, -80, -88, -96, -104, -112, -120, -128, -136, -144, -152, -160, -168, -176, -184, -192, -200, -208, -216, -224, -232, -240, -248, -256, -264, -272, -280, -288, -296, -304, -312, -320, -328, -336, -344, -352, -360, -368, -376, -384, -392, -400, -408, -416, -424, -432, -440, -448, -456, -464, -472, -480, -488, -496, -504, -512, // +28 + -11, -22, -32, -43, -54, -64, -75, -86, -96, -107, -118, -128, -139, -150, -160, -171, -182, -192, -203, -214, -224, -235, -246, -256, -267, -278, -288, -299, -310, -320, -331, -341, -352, -363, -373, -384, -395, -405, -416, -427, -437, -448, -459, -469, -480, -491, -501, -512, -523, -533, -544, -555, -565, -576, -587, -597, -608, -619, -629, -640, -651, -661, -672, -682, + -16, -32, -48, -64, -80, -96, -112, -128, -144, -160, -176, -192, -208, -224, -240, -256, -272, -288, -304, -320, -336, -352, -368, -384, -400, -416, -432, -448, -464, -480, -496, -512, -528, -544, -560, -576, -592, -608, -624, -640, -656, -672, -688, -704, -720, -736, -752, -768, -784, -800, -816, -832, -848, -864, -880, -896, -912, -928, -944, -960, -976, -992, -1008, -1024, + -32, -64, -96, -128, -160, -192, -224, -256, -288, -320, -352, -384, -416, -448, -480, -512, -544, -576, -608, -640, -672, -704, -736, -768, -800, -832, -864, -896, -928, -960, -992, -1024, -1056, -1088, -1120, -1152, -1184, -1216, -1248, -1280, -1312, -1344, -1376, -1408, -1440, -1472, -1504, -1536, -1568, -1600, -1632, -1664, -1696, -1728, -1760, -1792, -1824, -1856, -1888, -1920, -1952, -1984, -2016, -2048, +}; + +ALIGNED(32) static const int16_t delta_fract_table[2048] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // +4 + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, +10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 
22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // +8 +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // +12 +23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Diagonal mode. 
Wide angle modes below + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, +13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, +19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // +20 +25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // +24 + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // +28 +21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /** @@ -1451,9 +1420,9 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. 
// Set delta table pointers - int mode_offset = (pred_mode - 2) * 64; - const int16_t* delta_int = &delta_int_table[mode_offset]; - const int16_t* delta_fract = &delta_fract_table[mode_offset]; + const int table_offset = abs(mode_disp) * 64; + const int16_t* delta_int = mode_disp >= 0 ? &delta_int_table[table_offset] : &delta_int_neg_table[table_offset]; + const int16_t* delta_fract = &delta_fract_table[table_offset]; // TODO: for horizontal modes, these should be constructed using width instead of height //angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); From 36eba27172808e96bf117daf84209955d9844e73 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 22 Nov 2023 16:25:43 +0200 Subject: [PATCH 042/237] Rework delta int & delta fract tables. Indexing is now based on pred mode instead of sample disp. Add tables for wide angle modes. Wide angle indexing also based on pred mode. --- src/strategies/avx2/intra-avx2.c | 233 ++++++++++++++++++------------- 1 file changed, 135 insertions(+), 98 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 425ee8f8..72e1841d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -154,113 +154,151 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever }; -// TODO: extend delta tables to hold 64 y offsets when 64x64 prediction is supported. + // Delta int and delta fract tables. Rows are prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_table[2048] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // +4 - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 
15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // +8 - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // +12 - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // Diagonal mode. 
Wide angle modes below - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, - 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, - 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, - 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // +20 +ALIGNED(32) static const int16_t delta_int_table[2112] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, + 0, 0, 0, 1, 1, 1, 1, 2, 
2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal and vertical mode + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, 
-21, -22, -22, -23, -23, -23, -24, -24, -24, // 26 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode +}; + + +// TODO: cut this table to 32 width, the second 32 width half is identical to the first +ALIGNED(32) static const int16_t delta_fract_table[2112] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 
11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 +10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal & vertical mode +31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, +30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 
26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, +28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode +}; + + +// TODO: cut this table in half due to symmetry +// Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. 
(or x offset for horizontal modes) +ALIGNED(32) static const int16_t delta_int_wide_angle_table[1856] = { + 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 + 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 + 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, + 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, // -8 + 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, + 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // -6 + 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, // -4 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, + 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // -2 + 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 
88, 90, + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, // 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 67 + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, + 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, // 69 + 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, + 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, // 71 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, - 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, - 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // +24 - 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, + 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, // 73 + 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, + 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 
60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, // 75 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, - 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, - 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // +28 - 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, + 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, // 77 + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, + 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, // 79 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, - 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 }; -ALIGNED(32) static const int16_t delta_int_neg_table[2048] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // +4 - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // +8 - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, 
-39, -40, -40, // +12 - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // Diagonal mode. Wide angle modes below - -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -60, -61, -62, -63, -64, -65, -66, -67, -68, -69, -70, - -2, -3, -4, -5, -7, -8, -9, -10, -11, -13, -14, -15, -16, -18, -19, -20, -21, -22, -24, -25, -26, -27, -29, -30, -31, -32, -33, -35, -36, -37, -38, -39, -41, -42, -43, -44, -46, -47, -48, -49, -50, -52, -53, -54, -55, -57, -58, -59, -60, -61, -63, -64, -65, -66, -68, -69, -70, -71, -72, -74, -75, -76, -77, -78, - -2, -3, -5, -6, -8, -9, -10, -12, -13, -15, -16, -17, -19, -20, -22, -23, -24, -26, -27, -29, -30, -31, -33, -34, -36, -37, -38, -40, -41, -43, -44, -45, -47, -48, -50, -51, -53, -54, -55, -57, -58, -60, -61, -62, -64, -65, -67, -68, -69, -71, -72, -74, -75, -76, -78, -79, -81, -82, -83, -85, -86, -88, -89, -90, - -2, -4, -5, -7, -8, -10, -12, -13, -15, -16, -18, -20, -21, -23, -24, -26, -28, -29, -31, -32, -34, -36, -37, -39, -40, -42, -44, -45, -47, -48, -50, -51, -53, -55, -56, -58, -59, -61, -63, -64, -66, -67, -69, -71, -72, -74, -75, -77, -79, -80, -82, -83, -85, -87, -88, -90, -91, -93, -95, -96, -98, -99, -101, -102, // +20 - -2, -4, -6, -8, -9, -11, -13, -15, -17, -18, -20, -22, -24, -25, -27, -29, -31, -33, -34, -36, -38, -40, -41, -43, -45, -47, -49, -50, -52, -54, -56, -57, -59, -61, -63, -65, -66, -68, -70, -72, -74, -75, -77, -79, -81, -82, -84, -86, -88, -90, -91, -93, -95, -97, -98, -100, -102, -104, -106, -107, -109, -111, -113, -114, - -2, -4, -6, -8, -10, -12, -14, -16, -18, -20, -22, -24, -26, -28, -30, -32, -34, -36, -38, -40, -42, -44, -46, -48, -50, -52, -54, -56, -58, -60, -62, -64, -66, -68, -70, -72, -74, -76, -78, -80, -82, -84, -86, -88, -90, -92, -94, -96, -98, -100, -102, -104, -106, -108, -110, -112, -114, -116, -118, -120, -122, -124, -126, -128, - -3, -5, -7, -10, -12, -14, -16, -19, -21, -23, -26, -28, -30, -32, -35, -37, -39, -42, -44, -46, -48, -51, -53, -55, -58, -60, -62, -64, -67, -69, -71, -73, -76, -78, -80, -83, -85, -87, -89, -92, -94, -96, -99, -101, -103, -105, -108, -110, -112, -115, -117, -119, -121, -124, -126, -128, -131, -133, -135, 
-137, -140, -142, -144, -146, - -3, -6, -9, -11, -14, -17, -19, -22, -25, -27, -30, -33, -35, -38, -41, -43, -46, -49, -52, -54, -57, -60, -62, -65, -68, -70, -73, -76, -78, -81, -84, -86, -89, -92, -95, -97, -100, -103, -105, -108, -111, -113, -116, -119, -121, -124, -127, -129, -132, -135, -138, -140, -143, -146, -148, -151, -154, -156, -159, -162, -164, -167, -170, -172, // +24 - -4, -7, -10, -13, -16, -20, -23, -26, -29, -32, -36, -39, -42, -45, -48, -51, -55, -58, -61, -64, -67, -71, -74, -77, -80, -83, -87, -90, -93, -96, -99, -102, -106, -109, -112, -115, -118, -122, -125, -128, -131, -134, -138, -141, -144, -147, -150, -153, -157, -160, -163, -166, -169, -173, -176, -179, -182, -185, -189, -192, -195, -198, -201, -204, - -4, -8, -12, -16, -20, -24, -28, -32, -36, -40, -44, -48, -52, -56, -60, -64, -68, -72, -76, -80, -84, -88, -92, -96, -100, -104, -108, -112, -116, -120, -124, -128, -132, -136, -140, -144, -148, -152, -156, -160, -164, -168, -172, -176, -180, -184, -188, -192, -196, -200, -204, -208, -212, -216, -220, -224, -228, -232, -236, -240, -244, -248, -252, -256, - -6, -11, -17, -22, -27, -33, -38, -43, -49, -54, -59, -65, -70, -75, -81, -86, -91, -97, -102, -107, -113, -118, -123, -129, -134, -139, -145, -150, -155, -161, -166, -171, -177, -182, -188, -193, -198, -204, -209, -214, -220, -225, -230, -236, -241, -246, -252, -257, -262, -268, -273, -278, -284, -289, -294, -300, -305, -310, -316, -321, -326, -332, -337, -342, - -8, -16, -24, -32, -40, -48, -56, -64, -72, -80, -88, -96, -104, -112, -120, -128, -136, -144, -152, -160, -168, -176, -184, -192, -200, -208, -216, -224, -232, -240, -248, -256, -264, -272, -280, -288, -296, -304, -312, -320, -328, -336, -344, -352, -360, -368, -376, -384, -392, -400, -408, -416, -424, -432, -440, -448, -456, -464, -472, -480, -488, -496, -504, -512, // +28 - -11, -22, -32, -43, -54, -64, -75, -86, -96, -107, -118, -128, -139, -150, -160, -171, -182, -192, -203, -214, -224, -235, -246, -256, -267, -278, -288, -299, -310, -320, -331, -341, -352, -363, -373, -384, -395, -405, -416, -427, -437, -448, -459, -469, -480, -491, -501, -512, -523, -533, -544, -555, -565, -576, -587, -597, -608, -619, -629, -640, -651, -661, -672, -682, - -16, -32, -48, -64, -80, -96, -112, -128, -144, -160, -176, -192, -208, -224, -240, -256, -272, -288, -304, -320, -336, -352, -368, -384, -400, -416, -432, -448, -464, -480, -496, -512, -528, -544, -560, -576, -592, -608, -624, -640, -656, -672, -688, -704, -720, -736, -752, -768, -784, -800, -816, -832, -848, -864, -880, -896, -912, -928, -944, -960, -976, -992, -1008, -1024, - -32, -64, -96, -128, -160, -192, -224, -256, -288, -320, -352, -384, -416, -448, -480, -512, -544, -576, -608, -640, -672, -704, -736, -768, -800, -832, -864, -896, -928, -960, -992, -1024, -1056, -1088, -1120, -1152, -1184, -1216, -1248, -1280, -1312, -1344, -1376, -1408, -1440, -1472, -1504, -1536, -1568, -1600, -1632, -1664, -1696, -1728, -1760, -1792, -1824, -1856, -1888, -1920, -1952, -1984, -2016, -2048, -}; - -ALIGNED(32) static const int16_t delta_fract_table[2048] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Horizontal and vertical mode - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 
2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // +4 +// TODO: cut out the latter 32 entries due to symmetry +ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1856] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 +21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 +11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -8 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, -10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, -12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // +8 -14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, -16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, -18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 
0, -20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // +12 -23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, -26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, -29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Diagonal mode. Wide angle modes below - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, -13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, -19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // +20 +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // -6 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -4 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, +19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // -2 +13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, + 7, 14, 
21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, // 0 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 1 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 67 + 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, +13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, // 69 +19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, +25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, // 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // +24 - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, // 73 +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, // 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 
4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // +28 -21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, +11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, // 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, // 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 }; + /** * \brief Generate angular predictions. * \param cu_loc CU locationand size data. @@ -1338,9 +1376,11 @@ static void uvg_angular_pred_avx2( // Whether to swap references to always project on the left reference row. const bool vertical_mode = intra_mode >= 34; + // Modes distance to horizontal or vertical mode. Possible values: [-16, 16] // For pure vertical or horizontal modes, this is 0. For pure diagonal modes, this is either -16 or 16. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18); + const bool wide_angle_mode = mode_disp > 16; // Sample displacement per column in fractions of 32. const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; @@ -1420,12 +1460,9 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. // Set delta table pointers - const int table_offset = abs(mode_disp) * 64; - const int16_t* delta_int = mode_disp >= 0 ? &delta_int_table[table_offset] : &delta_int_neg_table[table_offset]; - const int16_t* delta_fract = &delta_fract_table[table_offset]; - - // TODO: for horizontal modes, these should be constructed using width instead of height - //angular_pred_avx2_delta_tables(delta_int, delta_fract, height, multi_ref_index, sample_disp); + const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (67 + 14 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); + const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset]; + const int16_t* delta_fract = wide_angle_mode ? &delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; // Check if the angle is fractional. 
If yes, interpolation is needed if ((abs(sample_disp) & 0x1F) != 0) { From 40ff57864466cb5da782a690de0cacdbd405701b Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 24 Nov 2023 13:35:22 +0200 Subject: [PATCH 043/237] Implement width 8 angular intra avx2. Fix bug in wide angle table indexing. --- src/strategies/avx2/intra-avx2.c | 145 +++++++++++++++---------------- 1 file changed, 69 insertions(+), 76 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 72e1841d..8a55f3a6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -233,7 +233,7 @@ ALIGNED(32) static const int16_t delta_fract_table[2112] = { // TODO: cut this table in half due to symmetry // Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_wide_angle_table[1856] = { +ALIGNED(32) static const int16_t delta_int_wide_angle_table[1792] = { 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 @@ -262,11 +262,11 @@ ALIGNED(32) static const int16_t delta_int_wide_angle_table[1856] = { 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, // 79 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, - 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 + }; // TODO: cut out the latter 32 entries due to symmetry -ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1856] = { +ALIGNED(32) static const int16_t 
delta_fract_wide_angle_table[1792] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 @@ -295,7 +295,6 @@ ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1856] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, // 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 }; @@ -1145,97 +1144,91 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, { const int width = 8; - const __m256i p_shuf_01 = _mm256_setr_epi8( - 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, - 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, - 0x00, 0x01, 0x08, 0x09, 0x01, 0x02, 0x09, 0x0a, - 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c + const __m256i w_shuf = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x04, 0x06, 0x04, 0x06, + 0x08, 0x0a, 0x08, 0x0a, 0x0c, 0x0e, 0x0c, 0x0e, + 0x00, 0x02, 0x00, 0x02, 0x04, 0x06, 0x04, 0x06, + 0x08, 0x0a, 0x08, 0x0a, 0x0c, 0x0e, 0x0c, 0x0e ); - const __m256i p_shuf_23 = _mm256_setr_epi8( - 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, - 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e, - 0x02, 0x03, 0x0a, 0x0b, 0x03, 0x04, 0x0b, 0x0c, - 0x04, 0x05, 0x0c, 0x0d, 0x05, 0x06, 0x0d, 0x0e + const __m128i r_shuffle = _mm_setr_epi8( + 0x00, 0x02, 0x04, 0x06, 0x01, 0x03, 0x05, 0x07, + 0x08, 0x0a, 0x0c, 0x0e, 0x09, 0x0b, 0x0d, 0x0f ); - const __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a - ); + int16_t f[8][4] = { { 0 } }; + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[0]], sizeof(int16_t) * 4); + memcpy(f[1], cubic_filter[delta_fract[1]], sizeof(int16_t) * 4); + memcpy(f[2], cubic_filter[delta_fract[2]], sizeof(int16_t) * 4); + memcpy(f[3], cubic_filter[delta_fract[3]], sizeof(int16_t) * 4); + memcpy(f[4], cubic_filter[delta_fract[4]], sizeof(int16_t) * 4); + memcpy(f[5], cubic_filter[delta_fract[5]], sizeof(int16_t) * 4); + memcpy(f[6], cubic_filter[delta_fract[6]], sizeof(int16_t) * 4); + memcpy(f[7], cubic_filter[delta_fract[7]], sizeof(int16_t) * 4); + } + else { + for (int x = 0; x < 8; ++x) { + const int16_t offset = (delta_fract[x] >> 1); + f[x][0] = 16 - offset; 
+ f[x][1] = 32 - offset; + f[x][2] = 16 + offset; + f[x][3] = offset; + } + } - const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e - ); + __m256i vidx = _mm256_setr_epi32(delta_int[0], delta_int[1], + delta_int[2], delta_int[3], + delta_int[4], delta_int[5], + delta_int[6], delta_int[7]); + __m256i weights0 = _mm256_loadu_si256((__m256i*)&f[0]); + __m256i weights1 = _mm256_loadu_si256((__m256i*)&f[4]); - const __m128i r_shuffle = _mm_setr_epi8( - 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, - 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f - ); + weights0 = _mm256_shuffle_epi32(weights0, _MM_SHUFFLE(3, 1, 2, 0)); + weights1 = _mm256_shuffle_epi32(weights1, _MM_SHUFFLE(3, 1, 2, 0)); - int16_t f[8][4] = { { 0 } }; + weights0 = _mm256_permute4x64_epi64(weights0, _MM_SHUFFLE(3, 1, 2, 0)); + weights1 = _mm256_permute4x64_epi64(weights1, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i tmp0 = _mm256_shuffle_epi8(weights0, w_shuf); + __m256i tmp1 = _mm256_shuffle_epi8(weights1, w_shuf); + + __m256i w0 = _mm256_permute2x128_si256(tmp0, tmp1, 0x20); + __m256i w1 = _mm256_permute2x128_si256(tmp0, tmp1, 0x31); // For a 8 width block, height must be at least 2. Handle 2 lines at once for (int y = 0; y < height; y += 2) { // Do 4-tap intra interpolation filtering uvg_pixel* p = (uvg_pixel*)(ref_main + y); + __m256i vp0 = _mm256_i32gather_epi32((const int*)(p + 0), vidx, 1); + __m256i vp1 = _mm256_i32gather_epi32((const int*)(p + 1), vidx, 1); - for (int_fast32_t x = 0; x < width; x += 8) { - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[x + 0]], 8); - memcpy(f[1], cubic_filter[delta_fract[x + 1]], 8); - memcpy(f[2], cubic_filter[delta_fract[x + 2]], 8); - memcpy(f[3], cubic_filter[delta_fract[x + 3]], 8); - memcpy(f[4], cubic_filter[delta_fract[x + 4]], 8); - memcpy(f[5], cubic_filter[delta_fract[x + 5]], 8); - memcpy(f[6], cubic_filter[delta_fract[x + 6]], 8); - memcpy(f[7], cubic_filter[delta_fract[x + 7]], 8); - } - else { - for (int xx = 0; xx < 8; ++xx) { - const int16_t offset = (delta_fract[x + xx] >> 1); - f[xx][0] = 16 - offset; - f[xx][1] = 32 - offset; - f[xx][2] = 16 + offset; - f[xx][3] = offset; - } - } - - // This solution assumes the delta int values to be 64-bit - // Cast from 16-bit to 64-bit. 
- __m256i vidx = _mm256_setr_epi32(delta_int[x + 0], delta_int[x + 1], - delta_int[x + 2], delta_int[x + 3], - delta_int[x + 4], delta_int[x + 5], - delta_int[x + 6], delta_int[x + 7]); - __m256i all_weights = _mm256_loadu_si256((__m256i*)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + __m256i vp_lo = _mm256_unpacklo_epi16(vp0, vp1); + __m256i vp_hi = _mm256_unpackhi_epi16(vp0, vp1); - __m256i vp = _mm256_i32gather_epi32((const int*)p, vidx, 1); + vp_lo = _mm256_shuffle_epi32(vp_lo, _MM_SHUFFLE(3, 1, 2, 0)); + vp_hi = _mm256_shuffle_epi32(vp_hi, _MM_SHUFFLE(3, 1, 2, 0)); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + __m256i vp_lo64 = _mm256_unpacklo_epi64(vp_lo, vp_hi); + __m256i vp_hi64 = _mm256_unpackhi_epi64(vp_lo, vp_hi); - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); + __m256i dot_01 = _mm256_maddubs_epi16(vp_lo64, w0); + __m256i dot_23 = _mm256_maddubs_epi16(vp_hi64, w1); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + filtered = _mm_shuffle_epi8(filtered, r_shuffle); + filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); - _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); + _mm_store_si128((__m128i*)dst, filtered); - dst += 16; - } + dst += 16; + } } @@ -1460,7 +1453,7 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. // Set delta table pointers - const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (67 + 14 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); + const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (pred_mode - 67 + 14) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset]; const int16_t* delta_fract = wide_angle_mode ? &delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; From d78d518bb83857f7862b9ef99e0cafebcaebc856 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 27 Nov 2023 16:24:47 +0200 Subject: [PATCH 044/237] Implement alternate version of horizontal w8 angular intra avx2. This gets rid of most of the permutes and shuffles by using horizontal add. 
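The gist of the change, stated as a scalar reference: each output pixel is a 4-tap filter of the projected main reference, (f[x][0]*p[0] + f[x][1]*p[1] + f[x][2]*p[2] + f[x][3]*p[3] + 32) >> 6 with p = ref_main + y + delta_int[x]. The two gathers fetch those four reference pixels per column for rows y and y+1, _mm256_maddubs_epi16 multiplies pixels by the int8 weights and sums adjacent pairs, and _mm256_hadd_epi16 completes the per-pixel sums for both rows at once, which is what makes the earlier unpack/permute sequence unnecessary. A scalar sketch of the same computation (illustrative only, not code from the patch; the function name is made up and the clamp stands in for the saturating pack, assuming 8-bit pixels):

    // Scalar reference of the 8-wide horizontal angular filtering (sketch only).
    static void angular_pred_w8_hor_ref(uvg_pixel* dst, const uvg_pixel* ref_main,
                                        const int16_t* delta_int, const int8_t f[8][4],
                                        const int height)
    {
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < 8; ++x) {
          const uvg_pixel* p = ref_main + y + delta_int[x];
          const int sum = f[x][0] * p[0] + f[x][1] * p[1] + f[x][2] * p[2] + f[x][3] * p[3];
          const int val = (sum + 32) >> 6;
          // The packus saturation in the AVX2 version corresponds to this clamp.
          dst[y * 8 + x] = (uvg_pixel)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
      }
    }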
--- src/strategies/avx2/intra-avx2.c | 101 ++++++++++++++++++------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8a55f3a6..82fb6de6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -773,6 +773,43 @@ static const int16_t cubic_filter[32][4] = }; +static const int8_t cubic_filter_8bit[32][4] = +{ + { 0, 64, 0, 0 }, + { -1, 63, 2, 0 }, + { -2, 62, 4, 0 }, + { -2, 60, 7, -1 }, + { -2, 58, 10, -2 }, + { -3, 57, 12, -2 }, + { -4, 56, 14, -2 }, + { -4, 55, 15, -2 }, + { -4, 54, 16, -2 }, + { -5, 53, 18, -2 }, + { -6, 52, 20, -2 }, + { -6, 49, 24, -3 }, + { -6, 46, 28, -4 }, + { -5, 44, 29, -4 }, + { -4, 42, 30, -4 }, + { -4, 39, 33, -4 }, + { -4, 36, 36, -4 }, + { -4, 33, 39, -4 }, + { -4, 30, 42, -4 }, + { -4, 29, 44, -5 }, + { -4, 28, 46, -6 }, + { -3, 24, 49, -6 }, + { -2, 20, 52, -6 }, + { -2, 18, 53, -5 }, + { -2, 16, 54, -4 }, + { -2, 15, 55, -4 }, + { -2, 14, 56, -4 }, + { -2, 12, 57, -3 }, + { -2, 10, 58, -2 }, + { -1, 7, 60, -2 }, + { 0, 4, 62, -2 }, + { 0, 2, 63, -1 }, +}; + + static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 4; @@ -1145,10 +1182,10 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int width = 8; const __m256i w_shuf = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x04, 0x06, 0x04, 0x06, - 0x08, 0x0a, 0x08, 0x0a, 0x0c, 0x0e, 0x0c, 0x0e, - 0x00, 0x02, 0x00, 0x02, 0x04, 0x06, 0x04, 0x06, - 0x08, 0x0a, 0x08, 0x0a, 0x0c, 0x0e, 0x0c, 0x0e + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e ); const __m128i r_shuffle = _mm_setr_epi8( @@ -1156,20 +1193,20 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, 0x08, 0x0a, 0x0c, 0x0e, 0x09, 0x0b, 0x0d, 0x0f ); - int16_t f[8][4] = { { 0 } }; + int8_t f[8][4] = { { 0 } }; if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[0]], sizeof(int16_t) * 4); - memcpy(f[1], cubic_filter[delta_fract[1]], sizeof(int16_t) * 4); - memcpy(f[2], cubic_filter[delta_fract[2]], sizeof(int16_t) * 4); - memcpy(f[3], cubic_filter[delta_fract[3]], sizeof(int16_t) * 4); - memcpy(f[4], cubic_filter[delta_fract[4]], sizeof(int16_t) * 4); - memcpy(f[5], cubic_filter[delta_fract[5]], sizeof(int16_t) * 4); - memcpy(f[6], cubic_filter[delta_fract[6]], sizeof(int16_t) * 4); - memcpy(f[7], cubic_filter[delta_fract[7]], sizeof(int16_t) * 4); + memcpy(f[0], cubic_filter_8bit[delta_fract[0]], sizeof(int8_t) * 4); + memcpy(f[1], cubic_filter_8bit[delta_fract[1]], sizeof(int8_t) * 4); + memcpy(f[2], cubic_filter_8bit[delta_fract[2]], sizeof(int8_t) * 4); + memcpy(f[3], cubic_filter_8bit[delta_fract[3]], sizeof(int8_t) * 4); + memcpy(f[4], cubic_filter_8bit[delta_fract[4]], sizeof(int8_t) * 4); + memcpy(f[5], cubic_filter_8bit[delta_fract[5]], sizeof(int8_t) * 4); + memcpy(f[6], cubic_filter_8bit[delta_fract[6]], sizeof(int8_t) * 4); + memcpy(f[7], cubic_filter_8bit[delta_fract[7]], sizeof(int8_t) * 4); } else { for (int x = 0; x < 8; ++x) { - const int16_t offset = (delta_fract[x] >> 1); + const int8_t offset = (delta_fract[x] >> 1); f[x][0] = 16 - offset; f[x][1] = 32 - offset; f[x][2] = 16 + offset; @@ -1181,20 +1218,10 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, 
const uvg_pixel* ref_main,
                                   delta_int[2], delta_int[3],
                                   delta_int[4], delta_int[5],
                                   delta_int[6], delta_int[7]);
-  __m256i weights0 = _mm256_loadu_si256((__m256i*)&f[0]);
-  __m256i weights1 = _mm256_loadu_si256((__m256i*)&f[4]);
-
-  weights0 = _mm256_shuffle_epi32(weights0, _MM_SHUFFLE(3, 1, 2, 0));
-  weights1 = _mm256_shuffle_epi32(weights1, _MM_SHUFFLE(3, 1, 2, 0));
-
-  weights0 = _mm256_permute4x64_epi64(weights0, _MM_SHUFFLE(3, 1, 2, 0));
-  weights1 = _mm256_permute4x64_epi64(weights1, _MM_SHUFFLE(3, 1, 2, 0));
-
-  __m256i tmp0 = _mm256_shuffle_epi8(weights0, w_shuf);
-  __m256i tmp1 = _mm256_shuffle_epi8(weights1, w_shuf);
-
-  __m256i w0 = _mm256_permute2x128_si256(tmp0, tmp1, 0x20);
-  __m256i w1 = _mm256_permute2x128_si256(tmp0, tmp1, 0x31);
+  __m256i w0 = _mm256_loadu_si256((__m256i*)&f[0]);
+  //__m256i w1 = _mm256_loadu_si256((__m256i*)&f[4]);
+  //w0 = _mm256_shuffle_epi8(w0, w_shuf);
+  //w1 = _mm256_shuffle_epi8(w1, w_shuf);

   // For a 8 width block, height must be at least 2. Handle 2 lines at once
   for (int y = 0; y < height; y += 2) {
@@ -1204,25 +1231,17 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main,
     __m256i vp0 = _mm256_i32gather_epi32((const int*)(p + 0), vidx, 1);
     __m256i vp1 = _mm256_i32gather_epi32((const int*)(p + 1), vidx, 1);

-    __m256i vp_lo = _mm256_unpacklo_epi16(vp0, vp1);
-    __m256i vp_hi = _mm256_unpackhi_epi16(vp0, vp1);
-
-    vp_lo = _mm256_shuffle_epi32(vp_lo, _MM_SHUFFLE(3, 1, 2, 0));
-    vp_hi = _mm256_shuffle_epi32(vp_hi, _MM_SHUFFLE(3, 1, 2, 0));
-
-    __m256i vp_lo64 = _mm256_unpacklo_epi64(vp_lo, vp_hi);
-    __m256i vp_hi64 = _mm256_unpackhi_epi64(vp_lo, vp_hi);
-
-    __m256i dot_01 = _mm256_maddubs_epi16(vp_lo64, w0);
-    __m256i dot_23 = _mm256_maddubs_epi16(vp_hi64, w1);
-    __m256i sum = _mm256_add_epi16(dot_01, dot_23);
+    __m256i dot_01 = _mm256_maddubs_epi16(vp0, w0);
+    __m256i dot_23 = _mm256_maddubs_epi16(vp1, w0);
+    __m256i sum = _mm256_hadd_epi16(dot_01, dot_23);
+    //__m256i sum = _mm256_add_epi16(dot_01, dot_23);
     sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32));
     sum = _mm256_srai_epi16(sum, 6);

     __m128i lo = _mm256_castsi256_si128(sum);
     __m128i hi = _mm256_extracti128_si256(sum, 1);
     __m128i filtered = _mm_packus_epi16(lo, hi);
-    filtered = _mm_shuffle_epi8(filtered, r_shuffle);
+    //filtered = _mm_shuffle_epi8(filtered, r_shuffle);
     filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0));

     _mm_store_si128((__m128i*)dst, filtered);

From b68ef546791997b082723683d3bb75a0f8603686 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Mon, 27 Nov 2023 16:59:23 +0200
Subject: [PATCH 045/237] Implement horizontal w16 angular intra avx2. This function is also used for w32 and w64.
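For reference, what the 16-wide kernel computes per row, written as plain C (a sketch assuming the same rounding as the vector path; the function name is illustrative and not from the patch). Every column has its own filter phase and reference offset, so widths 32 and 64 only need the x loop to run further, which is why the same kernel is reused for them:

/* uvg_pixel is uint8_t at 8-bit depth; f[x][0..3] is the 8-bit filter for
 * column x, delta_int[x] its reference offset. Types from <stdint.h>. */
static void angular_hor_ref_sketch(uint8_t *dst, const uint8_t *ref_main,
                                   const int8_t f[][4], const int16_t *delta_int,
                                   int width, int height)
{
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {                      /* width = 16, 32 or 64 */
      const uint8_t *win = ref_main + y + delta_int[x];    /* 4-sample window      */
      int sum = f[x][0] * win[0] + f[x][1] * win[1]
              + f[x][2] * win[2] + f[x][3] * win[3];
      sum = (sum + 32) >> 6;                               /* round and normalize  */
      dst[y * width + x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}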
--- src/strategies/avx2/intra-avx2.c | 87 +++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 82fb6de6..4e00148d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1181,18 +1181,6 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, { const int width = 8; - const __m256i w_shuf = _mm256_setr_epi8( - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e - ); - - const __m128i r_shuffle = _mm_setr_epi8( - 0x00, 0x02, 0x04, 0x06, 0x01, 0x03, 0x05, 0x07, - 0x08, 0x0a, 0x0c, 0x0e, 0x09, 0x0b, 0x0d, 0x0f - ); - int8_t f[8][4] = { { 0 } }; if (use_cubic) { memcpy(f[0], cubic_filter_8bit[delta_fract[0]], sizeof(int8_t) * 4); @@ -1218,12 +1206,8 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, delta_int[2], delta_int[3], delta_int[4], delta_int[5], delta_int[6], delta_int[7]); - __m256i w0 = _mm256_loadu_si256((__m256i*)&f[0]); - //__m256i w1 = _mm256_loadu_si256((__m256i*)&f[4]); - //w0 = _mm256_shuffle_epi8(w0, w_shuf); - //w1 = _mm256_shuffle_epi8(w1, w_shuf); + __m256i weights = _mm256_loadu_si256((__m256i*)&f[0]); - // For a 8 width block, height must be at least 2. Handle 2 lines at once for (int y = 0; y < height; y += 2) { // Do 4-tap intra interpolation filtering @@ -1231,23 +1215,76 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i vp0 = _mm256_i32gather_epi32((const int*)(p + 0), vidx, 1); __m256i vp1 = _mm256_i32gather_epi32((const int*)(p + 1), vidx, 1); - __m256i dot_01 = _mm256_maddubs_epi16(vp0, w0); - __m256i dot_23 = _mm256_maddubs_epi16(vp1, w0); + __m256i dot_01 = _mm256_maddubs_epi16(vp0, weights); + __m256i dot_23 = _mm256_maddubs_epi16(vp1, weights); __m256i sum = _mm256_hadd_epi16(dot_01, dot_23); - //__m256i sum = _mm256_add_epi16(dot_01, dot_23); sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); sum = _mm256_srai_epi16(sum, 6); __m128i lo = _mm256_castsi256_si128(sum); __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - //filtered = _mm_shuffle_epi8(filtered, r_shuffle); filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); _mm_store_si128((__m128i*)dst, filtered); dst += 16; - + } +} + +static void angular_pred_avx2_w16_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) +{ + int8_t f[64][4] = { { 0 } }; + if (use_cubic) { + for (int x = 0; x < width; ++x) { + memcpy(f[x], cubic_filter_8bit[delta_fract[x]], sizeof(int8_t) * 4); + } + } + else { + for (int x = 0; x < width; ++x) { + const int8_t offset = (delta_fract[x] >> 1); + f[x][0] = 16 - offset; + f[x][1] = 32 - offset; + f[x][2] = 16 + offset; + f[x][3] = offset; + } + } + + for (int x = 0; x < width; x += 16) { + __m256i vidx0 = _mm256_setr_epi32(delta_int[x + 0], delta_int[x + 1], + delta_int[x + 2], delta_int[x + 3], + delta_int[x + 4], delta_int[x + 5], + delta_int[x + 6], delta_int[x + 7]); + + __m256i vidx1 = _mm256_setr_epi32(delta_int[x + 8], delta_int[x + 9], + delta_int[x + 10], delta_int[x + 11], + delta_int[x + 12], delta_int[x + 13], + delta_int[x + 14], delta_int[x + 15]); + + __m256i w0 = _mm256_loadu_si256((__m256i*) & f[x + 0]); + __m256i w1 = 
_mm256_loadu_si256((__m256i*) & f[x + 8]); + + // Width 16, handle one row at a time + for (int y = 0; y < height; ++y) { + + // Do 4-tap intra interpolation filtering + uvg_pixel* p = (uvg_pixel*)(ref_main + y); + __m256i vp0 = _mm256_i32gather_epi32((const int*)p, vidx0, 1); + __m256i vp1 = _mm256_i32gather_epi32((const int*)p, vidx1, 1); + + __m256i dot_01 = _mm256_maddubs_epi16(vp0, w0); + __m256i dot_23 = _mm256_maddubs_epi16(vp1, w1); + __m256i sum = _mm256_hadd_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); + } } } @@ -1497,9 +1534,9 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_w4_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 8: angular_pred_avx2_w8_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 16: break; - case 32: break; - case 64: break; + case 16: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 32: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 64: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; From ec29ce749850f2524024da340f6d994f4fd55350 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 28 Nov 2023 15:38:20 +0200 Subject: [PATCH 046/237] Fix horizontal mode pdpc. --- src/strategies/avx2/intra-avx2.c | 48 ++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4e00148d..9c3a7c1b 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -265,7 +265,7 @@ ALIGNED(32) static const int16_t delta_int_wide_angle_table[1792] = { }; -// TODO: cut out the latter 32 entries due to symmetry +// TODO: cut out the latter 32 entries due to symmetry. Also, cut in half due to vertical symmetry ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1792] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, @@ -1555,20 +1555,44 @@ static void uvg_angular_pred_avx2( } else { // Mode is horizontal or vertical, just copy the pixels. + // NOTE: includes PDPC. 
// TODO: update outer loop to use height instead of width - for (int_fast32_t y = 0; y < height; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; + if (vertical_mode) { + for (int_fast32_t y = 0; y < height; ++y) { + for (int_fast32_t x = 0; x < width; ++x) { + dst[y * width + x] = ref_main[x + 1]; + } + if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { + int scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + const uvg_pixel left = ref_side[1 + y]; + for (int i = 0; i < MIN(3 << scale, width); i++) { + const int wL = 32 >> (2 * i >> scale); + const uvg_pixel val = dst[y * width + i]; + dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + } + } } - if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_height - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; - const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + } + else { + const uvg_pixel top_left = ref_main[0]; + int scale = (log2_width + log2_height - 2) >> 2; + for (int_fast32_t x = 0; x < width; ++x) { + for (int y = 0; y < height; ++y) { + dst[y * width + x] = ref_main[y + 1]; + } + + if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { + const uvg_pixel ref_top = ref_side[1 + x]; + for (int yy = 0; yy < MIN(3 << scale, height); ++yy) { + const int wT = 32 >> ((yy * 2) >> scale); + + const uvg_pixel val = dst[yy * width + x]; + dst[yy * width + x] = CLIP_TO_PIXEL(val + (((ref_top - top_left) * wT + 32) >> 6)); + + // pred_samples[x][y] = CLIP((refL[x][y] * wL[x] + refT[x][y] * wT[y] + (64 - wL[x] - wT[y]) * pred_samples[x][y] + 32) >> 6 ) + } } } } From a55cfe0f29cf5553dd28db1d14badfcfd73c2f92 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 29 Nov 2023 13:15:55 +0200 Subject: [PATCH 047/237] Implement avx2 horizontal pdpc for angular modes. 
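The vector PDPC code leans on a small rewrite of the blend, sketched below (the helper name is illustrative, not from the patch). Since 64 * pred is an exact multiple of 64, the weighted blend reduces to one multiply per lane, and forcing the weight to zero once the limit is passed leaves the prediction unchanged, so no final blend mask is needed.

/* Identity used by the vector path:
 *   (wT * top + (64 - wT) * pred + 32) >> 6  ==  pred + ((wT * (top - pred) + 32) >> 6)
 * which is exactly what the sub / mullo / add(32) / srai(6) / add sequence
 * computes; with wT == 0 the right-hand side is just pred. */
static inline uint8_t pdpc_hor_blend_sketch(uint8_t pred, uint8_t top, int wT)
{
  int val = pred + ((wT * (top - pred) + 32) >> 6);
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val)); /* packus-style clamp */
}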
--- src/strategies/avx2/intra-avx2.c | 99 +++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9c3a7c1b..70159364 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -890,10 +890,7 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); + _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); } } } @@ -1202,11 +1199,9 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, } } - __m256i vidx = _mm256_setr_epi32(delta_int[0], delta_int[1], - delta_int[2], delta_int[3], - delta_int[4], delta_int[5], - delta_int[6], delta_int[7]); - __m256i weights = _mm256_loadu_si256((__m256i*)&f[0]); + __m128i tmp = _mm_load_si128((__m128i*)delta_int); + __m256i vidx = _mm256_cvtepi16_epi32(tmp); + __m256i weights = _mm256_loadu_si256((__m256i*)f); for (int y = 0; y < height; y += 2) { @@ -1251,15 +1246,10 @@ static void angular_pred_avx2_w16_hor(uvg_pixel* dst, const uvg_pixel* ref_main, } for (int x = 0; x < width; x += 16) { - __m256i vidx0 = _mm256_setr_epi32(delta_int[x + 0], delta_int[x + 1], - delta_int[x + 2], delta_int[x + 3], - delta_int[x + 4], delta_int[x + 5], - delta_int[x + 6], delta_int[x + 7]); - - __m256i vidx1 = _mm256_setr_epi32(delta_int[x + 8], delta_int[x + 9], - delta_int[x + 10], delta_int[x + 11], - delta_int[x + 12], delta_int[x + 13], - delta_int[x + 14], delta_int[x + 15]); + __m128i tmp0 = _mm_load_si128((__m128i*)&delta_int[x]); + __m128i tmp1 = _mm_load_si128((__m128i*)&delta_int[x + 8]); + __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); + __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); __m256i w0 = _mm256_loadu_si256((__m256i*) & f[x + 0]); __m256i w1 = _mm256_loadu_si256((__m256i*) & f[x + 8]); @@ -1374,14 +1364,72 @@ static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con // TODO: vectorize static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { + //int limit = MIN(3 << scale, height); + + //for (int y = 0; y < limit; ++y) { + // int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; + // int16_t wT = 32 >> ((y << 1) >> scale); + // for (int x = 0; x < width; ++x) { + // int16_t top = ref_side[x + (inv_angle_sum >> 9) + 1]; + // dst[y * width + x] = CLIP_TO_PIXEL((top * wT + (64 - wT) * dst[y * width + x] + 32) >> 6); + // } + //} + int16_t wT[4]; + int16_t ref_top[4][4]; + int limit = MIN(3 << scale, height); + const int log2_width = uvg_g_convert_to_log2[width]; - for (int y = 0; y < limit; ++y) { - int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; - int16_t wT = 32 >> ((y << 1) >> scale); - for (int x = 0; x < width; ++x) { - int16_t top = ref_side[x + (inv_angle_sum >> 9) + 1]; - dst[y * width + x] = CLIP_TO_PIXEL((top * wT + (64 - wT) * dst[y * width + x] + 32) >> 6); + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + __m256i vwT_shuffle = _mm256_setr_epi8(0, 
1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7); + for (int y = 0; y < limit; y += 4) { + for (int x = 0; x < width; x += 4) { + + for (int yy = 0; yy < 4; ++yy) { + int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; + + // Set weight to zero if limit reached. + // This removes the need to blend results with unmodified values in the end. + wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; + for (int xx = 0; xx < 4; ++xx) { + ref_top[yy][xx] = ref_side[(x + xx) + (inv_angle_sum >> 9) + 1]; + } + } + + __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); + uint64_t quad; + memcpy(&quad, wT, sizeof(quad)); + __m256i vwT = _mm256_set1_epi64x(quad); + vwT = _mm256_shuffle_epi8(vwT, vwT_shuffle); + __m256i accu = _mm256_sub_epi16(vtop, vpred16); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // Need to mask remainder samples on the last iteration when limit % 4 != 0 + //int rem_bits = 8 * (limit - y); + //__m128i ones = _mm_set1_epi32(0xFF); + //__m128i vmask = _mm_slli_epi32(ones, rem_bits); + + //// 0 selects filtered, 1 vdst (unchanged) + //vpred = _mm_blendv_epi8(filtered, vpred, vmask); + + *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); } } } @@ -2645,8 +2693,7 @@ static void uvg_pdpc_planar_dc_avx2( } // Process in 4x4 blocks - // TODO: replace width with height - for (int y = 0; y < width; y += 4) { + for (int y = 0; y < height; y += 4) { for (int x = 0; x < width; x += 4) { uint32_t dw_left; From fc83b198fec5fd84178aa13b8386087985bbe24e Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 1 Dec 2023 16:15:46 +0200 Subject: [PATCH 048/237] Improve vertical and horizontal PDPC --- src/strategies/avx2/intra-avx2.c | 322 +++++++++++++++++++++++++++++-- 1 file changed, 305 insertions(+), 17 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 70159364..e8da831f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1305,7 +1305,7 @@ static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_ } -static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { int16_t wL[4]; int16_t left[4][4]; @@ -1360,20 +1360,152 @@ static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con } } +static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +{ + const int width = 4; + int16_t wL[4]; + int16_t left[4][4]; + + int limit = MIN(3 << scale, width); + const int log2_width = uvg_g_convert_to_log2[width]; -// TODO: vectorize -static 
void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + for (int xx = 0; xx < width; ++xx) { + int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; + wL[xx] = xx < limit ? 32 >> ((2 * xx) >> scale) : 0; + + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; + } + } + + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + uint64_t quad; + memcpy(&quad, wL, sizeof(quad)); + __m256i vwL = _mm256_set1_epi64x(quad); + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +{ + const int width = 8; + ALIGNED(32) int16_t wL[8]; + ALIGNED(32) int16_t left[2][8]; + + int limit = MIN(3 << scale, width); + const int log2_width = uvg_g_convert_to_log2[width]; + + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + // For width 8, height must be at least 2. Handle 2 lines at once. + for (int y = 0; y < height; y += 2) { + for (int xx = 0; xx < width; ++xx) { + int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; + wL[xx] = xx < limit ? 
32 >> ((2 * xx) >> scale) : 0; + + for (int yy = 0; yy < 2; ++yy) { + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; + } + } + + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + __m128i tmp[2]; + // Duplicate weights + tmp[0] = _mm_load_si128((__m128i*)wL); + tmp[1] = tmp[0]; + __m256i* vwL = (__m256i*)tmp; + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(*vwL, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + + + int limit = MIN(3 << scale, width); + const int log2_width = uvg_g_convert_to_log2[width]; + + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < limit; x += 16) { + ALIGNED(32) int16_t wL[16] = {0}; + ALIGNED(32) int16_t left[16] = {0}; + for (int xx = 0; x + xx < limit; ++xx) { + int shifted_inv_angle_sum = (256 + (x + xx + 1) * inv_sample_disp) >> 9; + wL[xx] = xx < limit ? 32 >> ((2 * (x + xx)) >> scale) : 0; + left[xx] = ref_side[y + shifted_inv_angle_sum + 1]; + } + + __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + __m256i* vwL = (__m256i*)wL; + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(*vwL, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); + } + } +} + +static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + switch (width) { + case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, inv_sample_disp); break; + case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, inv_sample_disp); break; + case 16: // 16 width and higher done with the same function + case 32: + case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, inv_sample_disp); break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); + } +} + + +static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { - //int limit = MIN(3 << scale, height); - - //for (int y = 0; y < limit; ++y) { - // int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; - // int16_t wT = 32 >> ((y << 1) >> scale); - // for (int x = 0; x < width; ++x) { - // int16_t top = ref_side[x + (inv_angle_sum >> 9) + 1]; - // dst[y * width + x] = CLIP_TO_PIXEL((top * wT + (64 - wT) * dst[y * width + x] + 32) >> 6); - // } - //} int16_t wT[4]; int16_t ref_top[4][4]; @@ -1384,15 +1516,15 @@ 
static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con __m128i vidx = _mm_slli_epi32(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); __m256i vwT_shuffle = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7); + 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7); for (int y = 0; y < limit; y += 4) { for (int x = 0; x < width; x += 4) { for (int yy = 0; yy < 4; ++yy) { int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; - + // Set weight to zero if limit reached. // This removes the need to blend results with unmodified values in the end. wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; @@ -1434,6 +1566,162 @@ static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, con } } +static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +{ + const int width = 4; + + int16_t wT[4]; + int16_t ref_top[4][4]; + + int limit = MIN(3 << scale, height); + const int log2_width = uvg_g_convert_to_log2[width]; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + __m256i vwT_shuffle = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7); + for (int y = 0; y < limit; y += 4) { + for (int yy = 0; yy < 4; ++yy) { + int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; + + // Set weight to zero if limit reached. + // This removes the need to blend results with unmodified values in the end. + wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; + for (int x = 0; x < 4; ++x) { + ref_top[yy][x] = ref_side[(x) + (inv_angle_sum >> 9) + 1]; + } + } + + __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); + uint64_t quad; + memcpy(&quad, wT, sizeof(quad)); + __m256i vwT = _mm256_set1_epi64x(quad); + vwT = _mm256_shuffle_epi8(vwT, vwT_shuffle); + __m256i accu = _mm256_sub_epi16(vtop, vpred16); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +{ + const int width = 8; + + int16_t ref_top[2][8]; + + int limit = MIN(3 << scale, height); + const int log2_width = uvg_g_convert_to_log2[width]; + + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + for (int y = 0; y < height; y += 2) { + // Set weight to zero if limit reached. + // This removes the need to blend results with unmodified values in the end. + const int16_t wT0 = y + 0 < limit ? 32 >> (2 * (y + 0) >> scale) : 0; + const int16_t wT1 = y + 1 < limit ? 
32 >> (2 * (y + 1) >> scale) : 0; + + __m128i vwT[2]; + vwT[0] = _mm_set1_epi16(wT0); + vwT[1] = _mm_set1_epi16(wT1); + + int inv_angle_sum = 256 + (y + 0 + 1) * inv_sample_disp; + ref_top[0][0] = ref_side[0 + (inv_angle_sum >> 9) + 1]; + ref_top[0][1] = ref_side[1 + (inv_angle_sum >> 9) + 1]; + ref_top[0][2] = ref_side[2 + (inv_angle_sum >> 9) + 1]; + ref_top[0][3] = ref_side[3 + (inv_angle_sum >> 9) + 1]; + ref_top[0][4] = ref_side[4 + (inv_angle_sum >> 9) + 1]; + ref_top[0][5] = ref_side[5 + (inv_angle_sum >> 9) + 1]; + ref_top[0][6] = ref_side[6 + (inv_angle_sum >> 9) + 1]; + ref_top[0][7] = ref_side[7 + (inv_angle_sum >> 9) + 1]; + + inv_angle_sum = 256 + (y + 1 + 1) * inv_sample_disp; + ref_top[1][0] = ref_side[0 + (inv_angle_sum >> 9) + 1]; + ref_top[1][1] = ref_side[1 + (inv_angle_sum >> 9) + 1]; + ref_top[1][2] = ref_side[2 + (inv_angle_sum >> 9) + 1]; + ref_top[1][3] = ref_side[3 + (inv_angle_sum >> 9) + 1]; + ref_top[1][4] = ref_side[4 + (inv_angle_sum >> 9) + 1]; + ref_top[1][5] = ref_side[5 + (inv_angle_sum >> 9) + 1]; + ref_top[1][6] = ref_side[6 + (inv_angle_sum >> 9) + 1]; + ref_top[1][7] = ref_side[7 + (inv_angle_sum >> 9) + 1]; + + __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); + + __m256i accu = _mm256_sub_epi16(vtop, vpred16); + accu = _mm256_mullo_epi16(*(__m256i*)vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + int limit = MIN(3 << scale, height); + const int log2_width = uvg_g_convert_to_log2[width]; + __m256i v32s = _mm256_set1_epi16(32); + + // Handle one line at a time. Skip line if vertical limit reached. 
+ for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; + for (int x = 0; x < width; x += 16) { + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m128i vtop = _mm_load_si128((__m128i*)&ref_side[x + (inv_angle_sum >> 9) + 1]); + __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); + + __m256i accu = _mm256_sub_epi16(vtop16, vpred16); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); + } + } +} + +static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + switch (width) { + case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, inv_sample_disp); break; + case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, inv_sample_disp); break; + case 16: // 16 width and higher done with the same function + case 32: + case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, inv_sample_disp); break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); + } +} + static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, From 9f634981d27e6d1720ca4eeb00e8aef552d71059 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 4 Dec 2023 16:44:18 +0200 Subject: [PATCH 049/237] Simplify PDPC function call. --- src/strategies/avx2/intra-avx2.c | 46 +++++++++++++------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e8da831f..33ac244e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1490,19 +1490,6 @@ static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) -{ - switch (width) { - case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, inv_sample_disp); break; - case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, inv_sample_disp); break; - case 16: // 16 width and higher done with the same function - case 32: - case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, inv_sample_disp); break; - default: - assert(false && "Intra PDPC: Invalid width.\n"); - } -} - static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { @@ -1709,19 +1696,6 @@ static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) -{ - switch (width) { - case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, inv_sample_disp); break; - case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, inv_sample_disp); break; - case 16: // 16 width and higher done with the same function - case 32: - case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, 
height, scale, inv_sample_disp); break; - default: - assert(false && "Intra PDPC: Invalid width.\n"); - } -} - static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, @@ -1949,9 +1923,25 @@ static void uvg_angular_pred_avx2( } if (PDPC_filter) { if (vertical_mode) - angular_pdpc_ver_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); + switch (width) { + case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 16: // 16 width and higher done with the same function + case 32: + case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); + } else - angular_pdpc_hor_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); + switch (width) { + case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 16: // 16 width and higher done with the same function + case 32: + case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); + } } } } From be1aa3bc6bd75699fdc83a934eb7499927fde38e Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 4 Dec 2023 17:03:30 +0200 Subject: [PATCH 050/237] Fix mistakes in PDPC w8 horizontal. Limit was not utilized correctly. Improve loading of references. --- src/strategies/avx2/intra-avx2.c | 35 +++++++++----------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 33ac244e..543b3add 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1607,8 +1607,6 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, { const int width = 8; - int16_t ref_top[2][8]; - int limit = MIN(3 << scale, height); const int log2_width = uvg_g_convert_to_log2[width]; @@ -1616,41 +1614,28 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m128i vidx = _mm_slli_epi64(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); - for (int y = 0; y < height; y += 2) { + for (int y = 0; y < limit; y += 2) { // Set weight to zero if limit reached. // This removes the need to blend results with unmodified values in the end. - const int16_t wT0 = y + 0 < limit ? 32 >> (2 * (y + 0) >> scale) : 0; + const int16_t wT0 = 32 >> (2 * (y + 0) >> scale); // This cannot reach limit, so do not check const int16_t wT1 = y + 1 < limit ? 
32 >> (2 * (y + 1) >> scale) : 0; __m128i vwT[2]; vwT[0] = _mm_set1_epi16(wT0); vwT[1] = _mm_set1_epi16(wT1); - int inv_angle_sum = 256 + (y + 0 + 1) * inv_sample_disp; - ref_top[0][0] = ref_side[0 + (inv_angle_sum >> 9) + 1]; - ref_top[0][1] = ref_side[1 + (inv_angle_sum >> 9) + 1]; - ref_top[0][2] = ref_side[2 + (inv_angle_sum >> 9) + 1]; - ref_top[0][3] = ref_side[3 + (inv_angle_sum >> 9) + 1]; - ref_top[0][4] = ref_side[4 + (inv_angle_sum >> 9) + 1]; - ref_top[0][5] = ref_side[5 + (inv_angle_sum >> 9) + 1]; - ref_top[0][6] = ref_side[6 + (inv_angle_sum >> 9) + 1]; - ref_top[0][7] = ref_side[7 + (inv_angle_sum >> 9) + 1]; - - inv_angle_sum = 256 + (y + 1 + 1) * inv_sample_disp; - ref_top[1][0] = ref_side[0 + (inv_angle_sum >> 9) + 1]; - ref_top[1][1] = ref_side[1 + (inv_angle_sum >> 9) + 1]; - ref_top[1][2] = ref_side[2 + (inv_angle_sum >> 9) + 1]; - ref_top[1][3] = ref_side[3 + (inv_angle_sum >> 9) + 1]; - ref_top[1][4] = ref_side[4 + (inv_angle_sum >> 9) + 1]; - ref_top[1][5] = ref_side[5 + (inv_angle_sum >> 9) + 1]; - ref_top[1][6] = ref_side[6 + (inv_angle_sum >> 9) + 1]; - ref_top[1][7] = ref_side[7 + (inv_angle_sum >> 9) + 1]; + __m128i tmp[2]; + int shifted_inv_angle_sum = (256 + (y + 0 + 1) * inv_sample_disp) >> 9; + tmp[0] = _mm_load_si128((__m128i*)&ref_side[shifted_inv_angle_sum + 1]); + + shifted_inv_angle_sum = (256 + (y + 1 + 1) * inv_sample_disp) >> 9; + tmp[1] = _mm_load_si128((__m128i*) &ref_side[shifted_inv_angle_sum + 1]); __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); + __m256i* vtop = (__m256i*)tmp; - __m256i accu = _mm256_sub_epi16(vtop, vpred16); + __m256i accu = _mm256_sub_epi16(*vtop, vpred16); accu = _mm256_mullo_epi16(*(__m256i*)vwT, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); From 6f3950bc5c74e2a49d0b17c05e2b70d2322d013d Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 5 Dec 2023 17:48:59 +0200 Subject: [PATCH 051/237] Implement h4 PDPC. Fix error in w8 horizontal PDPC. --- src/strategies/avx2/intra-avx2.c | 185 ++++++++++++++++++++++++++++--- 1 file changed, 170 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 543b3add..91df174a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1407,9 +1407,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) { const int width = 8; - ALIGNED(32) int16_t wL[8]; - ALIGNED(32) int16_t left[2][8]; - + int limit = MIN(3 << scale, width); const int log2_width = uvg_g_convert_to_log2[width]; @@ -1419,12 +1417,14 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // For width 8, height must be at least 2. Handle 2 lines at once. for (int y = 0; y < height; y += 2) { - for (int xx = 0; xx < width; ++xx) { + ALIGNED(32) int16_t wL[8] = {0}; + ALIGNED(32) int16_t left[16] = {0}; + for (int xx = 0; xx < limit; ++xx) { int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; wL[xx] = xx < limit ? 
32 >> ((2 * xx) >> scale) : 0; for (int yy = 0; yy < 2; ++yy) { - left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; + left[yy * width +xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; } } @@ -1491,6 +1491,160 @@ static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } +// Height versions of vertical PDPC + +static void angular_pdpc_ver_h4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int scale, const int16_t inv_sample_disp) +{ + const int height = 4; + + int limit = MIN(3 << scale, width); + const int log2_width = uvg_g_convert_to_log2[width]; + + const __m256i v32s = _mm256_set1_epi16(32); + const __m256i wL_shuffle = _mm256_setr_epi8( + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + ); + + for (int x = 0; x < limit; x += 4) { + int shifted_inv_angle_sum[4] = {0}; + int16_t wL[4] = {0}; + ALIGNED(32) uvg_pixel tmp[16]; + for (int xx = 0; xx < 4; ++xx) { + shifted_inv_angle_sum[xx] = (256 + (x + xx + 1) * inv_sample_disp) >> 9; + wL[xx] = (x + xx) < limit ? 32 >> ((2 * (x + xx)) >> scale) : 0; + + tmp[xx * 4 + 0] = ref_side[0 + shifted_inv_angle_sum[xx] + 1]; + tmp[xx * 4 + 1] = ref_side[1 + shifted_inv_angle_sum[xx] + 1]; + tmp[xx * 4 + 2] = ref_side[2 + shifted_inv_angle_sum[xx] + 1]; + tmp[xx * 4 + 3] = ref_side[3 + shifted_inv_angle_sum[xx] + 1]; + + } + + int16_t tmp_dst[16]; + for (int yy = 0; yy < height; ++yy) { + tmp_dst[0 + yy] = dst[yy * width + x + 0]; + tmp_dst[4 + yy] = dst[yy * width + x + 1]; + tmp_dst[8 + yy] = dst[yy * width + x + 2]; + tmp_dst[12 + yy] = dst[yy * width + x + 3]; + } + + __m256i* vdst16 = (__m256i*)tmp_dst; + __m128i vleft = _mm_load_si128((__m128i*)tmp); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + __m256i accu = _mm256_sub_epi16(vleft16, *vdst16); + __m256i vwL = _mm256_setr_epi64x(wL[0], wL[1], wL[2], wL[3]); + vwL = _mm256_shuffle_epi8(vwL, wL_shuffle); + accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(*vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + const uvg_pixel* result = (uvg_pixel*)&filtered; + + for (int yy = 0; yy < height; ++yy) { + dst[yy * width + x + 0] = result[0 + yy]; + dst[yy * width + x + 1] = result[4 + yy]; + dst[yy * width + x + 2] = result[8 + yy]; + dst[yy * width + x + 3] = result[12 + yy]; + } + } +} + +static void angular_pdpc_ver_h8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int scale, const int16_t inv_sample_disp) +{ + const int height = 8; + + int limit = MIN(3 << scale, width); + __m256i v32s = _mm256_set1_epi16(32); + + for (int x = 0; x < limit; x += 2) { + int shifted_inv_angle_sum0 = (256 + (x + 0 + 1) * inv_sample_disp) >> 9; + int shifted_inv_angle_sum1 = (256 + (x + 1 + 1) * inv_sample_disp) >> 9; + __m128i vwL[2]; + const int16_t wL0 = 32 >> ((2 * (x + 0)) >> scale); + const int16_t wL1 = (x + 1) < limit ? 
32 >> ((2 * (x + 1)) >> scale) : 0; + vwL[0] = _mm_set1_epi16(wL0); + vwL[1] = _mm_set1_epi16(wL1); + + ALIGNED(32) int16_t tmp_dst[16]; + for (int yy = 0; yy < height; ++yy) { + tmp_dst[0 + yy] = dst[(yy) * width + x + 0]; + tmp_dst[8 + yy] = dst[(yy) * width + x + 1]; + } + + ALIGNED(32) uvg_pixel left[16]; + memcpy(&left[0], &ref_side[shifted_inv_angle_sum0 + 1], 8 * sizeof(uvg_pixel)); + memcpy(&left[8], &ref_side[shifted_inv_angle_sum1 + 1], 8 * sizeof(uvg_pixel)); + + __m256i vdst16 = _mm256_load_si256((__m256i*)tmp_dst); + __m128i vleft = _mm_load_si128((__m128i*)left); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + __m256i* vwL256 = (__m256i*)vwL; + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(*vwL256, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + const uvg_pixel* result = (uvg_pixel*)&filtered; + for (int yy = 0; yy < height; ++yy) { + dst[(yy) * width + x + 0] = result[0 + yy]; + dst[(yy) * width + x + 1] = result[8 + yy]; + } + + } +} + +static void angular_pdpc_ver_h16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +{ + int limit = MIN(3 << scale, width); + __m256i v32s = _mm256_set1_epi16(32); + + for (int x = 0; x < limit; ++x) { + int shifted_inv_angle_sum = (256 + (x + 1) * inv_sample_disp) >> 9; + const int16_t wL = 32 >> ((2 * x) >> scale); + const __m256i vwL = _mm256_set1_epi16(wL); + + for (int y = 0; y < height; y += 16) { + ALIGNED(32) int16_t tmp_dst[16]; + for (int yy = 0; yy < 16; ++yy) { + tmp_dst[yy] = dst[(y + yy) * width + x]; + } + __m256i vdst16 = _mm256_load_si256((__m256i*)tmp_dst); + __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum + 1]); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + const uvg_pixel* result = (uvg_pixel*)&filtered; + for (int yy = 0; yy < 16; ++yy) { + dst[(y + yy) * width + x] = result[yy]; + } + } + } +} + + static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { int16_t wT[4]; @@ -1624,18 +1778,19 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, vwT[0] = _mm_set1_epi16(wT0); vwT[1] = _mm_set1_epi16(wT1); - __m128i tmp[2]; + ALIGNED(32) uvg_pixel tmp[16]; int shifted_inv_angle_sum = (256 + (y + 0 + 1) * inv_sample_disp) >> 9; - tmp[0] = _mm_load_si128((__m128i*)&ref_side[shifted_inv_angle_sum + 1]); + memcpy(&tmp[0], &ref_side[shifted_inv_angle_sum + 1], 8 * sizeof(uvg_pixel)); shifted_inv_angle_sum = (256 + (y + 1 + 1) * inv_sample_disp) >> 9; - tmp[1] = _mm_load_si128((__m128i*) &ref_side[shifted_inv_angle_sum + 1]); + memcpy(&tmp[8], &ref_side[shifted_inv_angle_sum + 1], 8 * sizeof(uvg_pixel)); __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i* vtop = (__m256i*)tmp; + __m128i vtop 
= _mm_load_si128((__m128i*)tmp); + __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); - __m256i accu = _mm256_sub_epi16(*vtop, vpred16); + __m256i accu = _mm256_sub_epi16(vtop16, vpred16); accu = _mm256_mullo_epi16(*(__m256i*)vwT, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); @@ -1908,12 +2063,12 @@ static void uvg_angular_pred_avx2( } if (PDPC_filter) { if (vertical_mode) - switch (width) { - case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; - case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; - case 16: // 16 width and higher done with the same function + switch (height) { + case 4: angular_pdpc_ver_h4_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 8: angular_pdpc_ver_h8_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 16: // 16 height and higher done with the same function case 32: - case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 64: angular_pdpc_ver_h16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } From 22fe8bf80853bc3cadcf1a8892e99be11c52c025 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 18 Dec 2023 16:18:28 +0200 Subject: [PATCH 052/237] Improve pure vertical and horizontal modes. Use memcpy and memset instead of copying pixels individually. --- src/strategies/avx2/intra-avx2.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 91df174a..309d03c8 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2007,15 +2007,17 @@ static void uvg_angular_pred_avx2( // Mode is horizontal or vertical, just copy the pixels. // NOTE: includes PDPC. 
- // TODO: update outer loop to use height instead of width if (vertical_mode) { + const uvg_pixel top_left = ref_main[0]; + int scale = (log2_width + log2_height - 2) >> 2; for (int_fast32_t y = 0; y < height; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { + memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + /*for (int_fast32_t x = 0; x < width; ++x) { dst[y * width + x] = ref_main[x + 1]; - } + }*/ + + // PDPC if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_height - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; const uvg_pixel left = ref_side[1 + y]; for (int i = 0; i < MIN(3 << scale, width); i++) { const int wL = 32 >> (2 * i >> scale); @@ -2028,11 +2030,15 @@ static void uvg_angular_pred_avx2( else { const uvg_pixel top_left = ref_main[0]; int scale = (log2_width + log2_height - 2) >> 2; + for (int y = 0; y < height; ++y) { + memset(&dst[y * width], ref_main[y + 1], width * sizeof(uvg_pixel)); + } for (int_fast32_t x = 0; x < width; ++x) { - for (int y = 0; y < height; ++y) { + /*for (int y = 0; y < height; ++y) { dst[y * width + x] = ref_main[y + 1]; - } + }*/ + // PDPC if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { const uvg_pixel ref_top = ref_side[1 + x]; for (int yy = 0; yy < MIN(3 << scale, height); ++yy) { From a946a9e1585da1954d9d82da377ee8fa89b01f8e Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 20 Dec 2023 12:03:33 +0200 Subject: [PATCH 053/237] Further improve pure horizontal and vertical modes. Use switch case with constant values for width. This should autovectorize. --- src/strategies/avx2/intra-avx2.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 309d03c8..1d2ff490 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2011,7 +2011,14 @@ static void uvg_angular_pred_avx2( const uvg_pixel top_left = ref_main[0]; int scale = (log2_width + log2_height - 2) >> 2; for (int_fast32_t y = 0; y < height; ++y) { - memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + switch (width) { + case 4: memcpy(&dst[y * 4], &ref_main[1], 4 * sizeof(uvg_pixel)); break; + case 8: memcpy(&dst[y * 8], &ref_main[1], 8 * sizeof(uvg_pixel)); break; + case 16: memcpy(&dst[y * 16], &ref_main[1], 16 * sizeof(uvg_pixel)); break; + case 32: memcpy(&dst[y * 32], &ref_main[1], 32 * sizeof(uvg_pixel)); break; + case 64: memcpy(&dst[y * 64], &ref_main[1], 64 * sizeof(uvg_pixel)); break; + } + /*for (int_fast32_t x = 0; x < width; ++x) { dst[y * width + x] = ref_main[x + 1]; }*/ @@ -2031,7 +2038,16 @@ static void uvg_angular_pred_avx2( const uvg_pixel top_left = ref_main[0]; int scale = (log2_width + log2_height - 2) >> 2; for (int y = 0; y < height; ++y) { - memset(&dst[y * width], ref_main[y + 1], width * sizeof(uvg_pixel)); + switch (width) { + case 4: memset(&dst[y * 4], ref_main[y + 1], 4 * sizeof(uvg_pixel)); break; + case 8: memset(&dst[y * 8], ref_main[y + 1], 8 * sizeof(uvg_pixel)); break; + case 16: memset(&dst[y * 16], ref_main[y + 1], 16 * sizeof(uvg_pixel)); break; + case 32: memset(&dst[y * 32], ref_main[y + 1], 32 * sizeof(uvg_pixel)); break; + case 64: memset(&dst[y * 64], ref_main[y + 1], 64 * sizeof(uvg_pixel)); break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; + } } for (int_fast32_t x = 0; x < width; ++x) { /*for 
(int y = 0; y < height; ++y) { From fc2b43c430d0423d57bfc3b67af8c77473a301c0 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 20 Dec 2023 12:12:22 +0200 Subject: [PATCH 054/237] Improve diagonal modes. Use switch case for different widths. --- src/strategies/avx2/intra-avx2.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1d2ff490..d494de06 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1292,14 +1292,17 @@ static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, cons } -// TODO: vectorize static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { for (int y = 0; y < height; ++y) { uvg_pixel* dst_row = dst + y * width; uvg_pixel* ref_row = ref + delta_int[y] + 1; - for (int_fast32_t x = 0; x < width; x += 4) { - memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); + switch (width) { + case 4: memcpy(dst_row, ref_row, 4 * sizeof(uvg_pixel)); break; + case 8: memcpy(dst_row, ref_row, 8 * sizeof(uvg_pixel)); break; + case 16: memcpy(dst_row, ref_row, 16 * sizeof(uvg_pixel)); break; + case 32: memcpy(dst_row, ref_row, 32 * sizeof(uvg_pixel)); break; + case 64: memcpy(dst_row, ref_row, 64 * sizeof(uvg_pixel)); break; } } } From 3b06c96895bee5502e39083cdda6740eb062965d Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 22 Dec 2023 15:54:24 +0200 Subject: [PATCH 055/237] WIP --- src/strategies/avx2/intra-avx2.c | 72 ++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d494de06..54fb5f64 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1279,9 +1279,9 @@ static void angular_pred_avx2_w16_hor(uvg_pixel* dst, const uvg_pixel* ref_main, } -// TODO: vectorize -static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { + // 2-tap filter for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { uvg_pixel ref1 = ref[x + delta_int[y] + 1]; @@ -1292,6 +1292,62 @@ static void angular_pred_avx2_linear_filter(uvg_pixel* dst, uvg_pixel* ref, cons } +static void angular_pred_avx2_linear_filter_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + // 2-tap linear filter + + // Handle filtering in 4x4 blocks + const int16_t* dint = delta_int; + for (int y = 0; y < height; y += 4) { + const __m128i vidx0 = _mm_setr_epi8( + dint[0], dint[0], dint[0], dint[0], + dint[0], dint[0], dint[0], dint[0], + dint[1], dint[1], dint[1], dint[1], + dint[1], dint[1], dint[1], dint[1] + ); + const __m128i vidx1 = _mm_setr_epi8( + dint[2], dint[2], dint[2], dint[2], + dint[2], dint[2], dint[2], dint[2], + dint[3], dint[3], dint[3], dint[3], + dint[3], dint[3], dint[3], dint[3] + ); + dint += 4; + + __m128i vshuffle0 = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04 + ); + + vshuffle0 = _mm_add_epi8(vshuffle0, vidx0); + const __m128i vshuffle1 = _mm_add_epi8(vshuffle0, vidx1); + int8_t tmp[2] = { 
-delta_fract[y], delta_fract[y] }; + const __m128i vcoeff = _mm_set1_epi16(*(int16_t*)tmp); + + for (int x = 0; x < width; x += 4) { + const __m128i vref = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + x + 1]); + const __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + const __m128i vref0 = _mm_shuffle_epi8(vref, vshuffle0); + const __m128i vref1 = _mm_shuffle_epi8(vref, vshuffle1); + } + } +} + + +static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + // 2-tap linear filter + + // Handle filtering in 4x4 blocks + for (int y = 0; y < height; y += 4) { + const __m256i vref = _mm256_loadu_si256((const __m256i*) & ref[y + 1]); + for (int x = 0; x < width; x += 4) { + + } + } +} + + static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { for (int y = 0; y < height; ++y) { @@ -1859,6 +1915,12 @@ static void uvg_angular_pred_avx2( const int log2_height = uvg_g_convert_to_log2[height]; assert((log2_width >= 2 && log2_width <= 6) && (log2_height >= 0 && log2_height <= 6)); + + // For chroma blocks, height has to be at least 2 + if (channel_type != COLOR_Y) { + assert(log2_height >= 1); + } + // Modes [-1, -14] and [67, 80] are wide angle modes assert(intra_mode >= -14 && intra_mode <= 80); @@ -1996,9 +2058,13 @@ static void uvg_angular_pred_avx2( } } } + // Chroma channels else { // Do linear filtering for chroma channels - angular_pred_avx2_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); + if (vertical_mode) + angular_pred_avx2_linear_filter_ver(dst, ref_main, width, height, delta_int, delta_fract); + else + angular_pred_avx2_linear_filter_hor(dst, ref_main, height, width, delta_int, delta_fract); } } else { From 25e0ea4b5a3ad6c727242ee77fd3ad2ac8a2163b Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 22 Dec 2023 15:54:24 +0200 Subject: [PATCH 056/237] Implement intra avx2 chroma linear filtering w4. NOTE: the memory access pattern is slow and it sucks. Redo it at some point. 
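For reference, the filter being vectorized here is the plain 2-tap chroma interpolation: each output pixel blends the two nearest reference samples with weights (32 - f) and f, where f is the 5-bit fractional part of the projected reference position, followed by rounding. A minimal scalar model of one output row (the function and parameter names are illustrative only, not part of this patch):

#include <stdint.h>

/* Scalar model of the 2-tap chroma interpolation for a single row.
 * 'd' and 'f' are the integer and fractional (0..31) parts of the
 * projected reference position for this row. Illustrative names only. */
static void linear_filter_row_scalar(uint8_t *dst, const uint8_t *ref,
                                     int width, int d, int f)
{
  for (int x = 0; x < width; ++x) {
    const int a = ref[x + d + 1];  /* nearer reference sample */
    const int b = ref[x + d + 2];  /* next reference sample   */
    dst[x] = (uint8_t)(((32 - f) * a + f * b + 16) >> 5);
  }
}

The AVX2 path computes the same value; only the data layout (interleaved sample pairs and packed byte coefficients) changes.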
--- src/strategies/avx2/intra-avx2.c | 160 +++++++++++++++++++++---------- 1 file changed, 112 insertions(+), 48 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 54fb5f64..8977fbf7 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -773,7 +773,9 @@ static const int16_t cubic_filter[32][4] = }; -static const int8_t cubic_filter_8bit[32][4] = +// Specified in JVET-T2001 8.4.5.2.13 Table 25 +// These are the fC interpolation filter coefficients +static const int8_t cubic_filter_8bit_c[32][4] = { { 0, 64, 0, 0 }, { -1, 63, 2, 0 }, @@ -809,6 +811,44 @@ static const int8_t cubic_filter_8bit[32][4] = { 0, 2, 63, -1 }, }; +// Specified in JVET-T2001 8.4.5.2.13 Table 25 +// These are the fG interpolation filter coefficients +static const int8_t cubic_filter_8bit_g[32][4] = +{ + {16, 32, 16, 0}, + {16, 32, 16, 0}, + {15, 31, 17, 1}, + {15, 31, 17, 1}, + {14, 30, 18, 2}, + {14, 30, 18, 2}, + {13, 29, 19, 3}, + {13, 29, 19, 3}, + {12, 28, 20, 4}, + {12, 28, 20, 4}, + {11, 27, 21, 5}, + {11, 27, 21, 5}, + {10, 26, 22, 6}, + {10, 26, 22, 6}, + { 9, 25, 23, 7}, + { 9, 25, 23, 7}, + { 8, 24, 24, 8}, + { 8, 24, 24, 8}, + { 7, 23, 25, 9}, + { 7, 23, 25, 9}, + { 6, 22, 26, 10}, + { 6, 22, 26, 10}, + { 5, 21, 27, 11}, + { 5, 21, 27, 11}, + { 4, 20, 28, 12}, + { 4, 20, 28, 12}, + { 3, 19, 29, 13}, + { 3, 19, 29, 13}, + { 2, 18, 30, 14}, + { 2, 18, 30, 14}, + { 1, 17, 31, 15}, + { 1, 17, 31, 15} +}; + static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { @@ -1180,14 +1220,14 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, int8_t f[8][4] = { { 0 } }; if (use_cubic) { - memcpy(f[0], cubic_filter_8bit[delta_fract[0]], sizeof(int8_t) * 4); - memcpy(f[1], cubic_filter_8bit[delta_fract[1]], sizeof(int8_t) * 4); - memcpy(f[2], cubic_filter_8bit[delta_fract[2]], sizeof(int8_t) * 4); - memcpy(f[3], cubic_filter_8bit[delta_fract[3]], sizeof(int8_t) * 4); - memcpy(f[4], cubic_filter_8bit[delta_fract[4]], sizeof(int8_t) * 4); - memcpy(f[5], cubic_filter_8bit[delta_fract[5]], sizeof(int8_t) * 4); - memcpy(f[6], cubic_filter_8bit[delta_fract[6]], sizeof(int8_t) * 4); - memcpy(f[7], cubic_filter_8bit[delta_fract[7]], sizeof(int8_t) * 4); + memcpy(f[0], cubic_filter_8bit_c[delta_fract[0]], sizeof(int8_t) * 4); + memcpy(f[1], cubic_filter_8bit_c[delta_fract[1]], sizeof(int8_t) * 4); + memcpy(f[2], cubic_filter_8bit_c[delta_fract[2]], sizeof(int8_t) * 4); + memcpy(f[3], cubic_filter_8bit_c[delta_fract[3]], sizeof(int8_t) * 4); + memcpy(f[4], cubic_filter_8bit_c[delta_fract[4]], sizeof(int8_t) * 4); + memcpy(f[5], cubic_filter_8bit_c[delta_fract[5]], sizeof(int8_t) * 4); + memcpy(f[6], cubic_filter_8bit_c[delta_fract[6]], sizeof(int8_t) * 4); + memcpy(f[7], cubic_filter_8bit_c[delta_fract[7]], sizeof(int8_t) * 4); } else { for (int x = 0; x < 8; ++x) { @@ -1232,7 +1272,7 @@ static void angular_pred_avx2_w16_hor(uvg_pixel* dst, const uvg_pixel* ref_main, int8_t f[64][4] = { { 0 } }; if (use_cubic) { for (int x = 0; x < width; ++x) { - memcpy(f[x], cubic_filter_8bit[delta_fract[x]], sizeof(int8_t) * 4); + memcpy(f[x], cubic_filter_8bit_c[delta_fract[x]], sizeof(int8_t) * 4); } } else { @@ -1286,50 +1326,56 @@ static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, c for (int x = 0; x < width; ++x) { uvg_pixel ref1 = ref[x + delta_int[y] + 1]; uvg_pixel ref2 = ref[x + 
delta_int[y] + 2]; - dst[y * width + x] = ref1 + ((delta_fract[y] * (ref2 - ref1) + 16) >> 5); + //dst[y * width + x] = ref1 + ((delta_fract[y] * (ref2 - ref1) + 16) >> 5); + dst[y * width + x] = ((32 - delta_fract[y]) * ref1 + delta_fract[y] * ref2 + 16) >> 5; } } } -static void angular_pred_avx2_linear_filter_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) { - // 2-tap linear filter - - // Handle filtering in 4x4 blocks + const int width = 4; const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { - const __m128i vidx0 = _mm_setr_epi8( - dint[0], dint[0], dint[0], dint[0], - dint[0], dint[0], dint[0], dint[0], - dint[1], dint[1], dint[1], dint[1], - dint[1], dint[1], dint[1], dint[1] - ); - const __m128i vidx1 = _mm_setr_epi8( - dint[2], dint[2], dint[2], dint[2], - dint[2], dint[2], dint[2], dint[2], - dint[3], dint[3], dint[3], dint[3], - dint[3], dint[3], dint[3], dint[3] - ); + uvg_pixel src[32]; + int16_t coeff_tmp[4]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int yy = 0; yy < 4; ++yy) { + src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; + src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; + int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; + coeff_tmp[yy] = *(int16_t*)tmp; + } dint += 4; - __m128i vshuffle0 = _mm_setr_epi8( - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04 - ); - - vshuffle0 = _mm_add_epi8(vshuffle0, vidx0); - const __m128i vshuffle1 = _mm_add_epi8(vshuffle0, vidx1); - int8_t tmp[2] = { -delta_fract[y], delta_fract[y] }; - const __m128i vcoeff = _mm_set1_epi16(*(int16_t*)tmp); - - for (int x = 0; x < width; x += 4) { - const __m128i vref = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + x + 1]); - const __m256i vref16 = _mm256_cvtepu8_epi16(vref); + int8_t tmp[2] = {32 - delta_fract[y], delta_fract[y]}; + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], + coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], + coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + //const __m256i vcoeff = _mm256_set1_epi16(*(int16_t*)tmp); - const __m128i vref0 = _mm_shuffle_epi8(vref, vshuffle0); - const __m128i vref1 = _mm_shuffle_epi8(vref, vshuffle1); - } + const __m128i* vsrc0 = (const __m128i*)&src[0]; + const __m128i* vsrc1 = (const __m128i*)&src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; } } @@ -2060,11 +2106,29 @@ static void uvg_angular_pred_avx2( } // Chroma channels else { - // Do linear filtering for chroma channels - if (vertical_mode) - 
angular_pred_avx2_linear_filter_ver(dst, ref_main, width, height, delta_int, delta_fract); - else - angular_pred_avx2_linear_filter_hor(dst, ref_main, height, width, delta_int, delta_fract); + // Do 2-tap linear filtering for chroma channels + if (vertical_mode) { + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 8: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 16: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 32: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + default: + assert(false && "Intra angular predicion: illegal chroma width.\n"); + break; + } + } + else { + switch (width) { // TODO: this generic solution does not work for horizontal modes. Start by implementing the vertical prediction first + case 4: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 8: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 16: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 32: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + default: + assert(false && "Intra angular predicion: illegal chroma width.\n"); + break; + } + } } } else { From f95727cfb5a784676240e629ab51ed9bed910da2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 19 Jan 2024 01:30:25 +0200 Subject: [PATCH 057/237] Implement intra avx2 chroma linear filtering w8 and w16. Width 32 can use the w16 function. NOTE: redo the memory access for each function. 
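The vectorized kernels all follow the same pattern: expand the reference row so each 16-bit lane holds an adjacent sample pair (p[x], p[x+1]), broadcast the byte pair (32 - f, f) as a 16-bit coefficient, and let _mm_maddubs_epi16 form (32 - f)*p[x] + f*p[x+1] per lane before the usual +16 and >> 5 rounding. A rough 8-pixel sketch of one row under those assumptions (names are illustrative, not the functions added by this patch; it assumes the reference buffer is large enough for a full 16-byte load, as the real reference arrays are):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* One 8-pixel row of the 2-tap filter. 'ref' points at the row's integer
 * reference position, 'f' is the fractional offset in 0..31.
 * Illustrative sketch only. */
static void linear_filter_row8_sse(uint8_t *dst, const uint8_t *ref, int f)
{
  /* Interleave adjacent samples: p0,p1, p1,p2, ..., p7,p8. */
  const __m128i vshuf = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4,
                                      4, 5, 5, 6, 6, 7, 7, 8);
  __m128i vsrc = _mm_loadu_si128((const __m128i *)ref); /* reads 16 bytes, uses 9 */
  vsrc = _mm_shuffle_epi8(vsrc, vshuf);

  /* Byte pair (32 - f, f) replicated into every 16-bit lane. */
  const int8_t c[2] = { (int8_t)(32 - f), (int8_t)f };
  int16_t c16;
  memcpy(&c16, c, sizeof(c16));
  const __m128i vcoeff = _mm_set1_epi16(c16);

  /* (32 - f)*p[x] + f*p[x + 1] per lane, then round, shift and narrow. */
  __m128i res = _mm_maddubs_epi16(vsrc, vcoeff);
  res = _mm_add_epi16(res, _mm_set1_epi16(16));
  res = _mm_srai_epi16(res, 5);
  _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(res, res));
}

The wider variants repeat this per 8-sample group (or use the 256-bit forms of the same intrinsics), which is also why width 32 can reuse the w16 code.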
--- src/strategies/avx2/intra-avx2.c | 95 ++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8977fbf7..01b06b6b 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1335,9 +1335,10 @@ static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, c static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) { - const int width = 4; const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { uvg_pixel src[32]; @@ -1352,12 +1353,12 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; - int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; + int8_t tmp[2] = { 32 - *dfract, *dfract }; + dfract++; coeff_tmp[yy] = *(int16_t*)tmp; } dint += 4; - int8_t tmp[2] = {32 - delta_fract[y], delta_fract[y]}; const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], @@ -1380,6 +1381,88 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re } +static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 8; + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m128i v16s = _mm_set1_epi16(16); + + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + uvg_pixel src[32]; + int16_t coeff_tmp[2]; + // TODO: faster memory access + for (int yy = 0; yy < 2; ++yy) { + const int offset = yy * 16; + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[offset + d + 0] = ref[dint[yy] + 1 + x + 0]; + src[offset + d + 1] = ref[dint[yy] + 1 + x + 1]; + } + int8_t tmp[2] = { 32 - *dfract, *dfract }; + dfract++; + coeff_tmp[yy] = *(int16_t*)tmp; + } + dint += 2; + + const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp[0]); + const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp[1]); + + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m128i v16s = _mm_set1_epi16(16); + + // TODO: modify to handle 2 lines at a time. Use __m256 vectors instead. 
We are dealing with chroma, so height must be at least 2 anyway + // Handle 1 line at a time + for (int y = 0; y < height; ++y) { + int8_t tmp[2] = { 32 - dfract[y], dfract[y]}; + const int16_t coeff_tmp = *(int16_t*)tmp; + const __m128i vcoeff = _mm_set1_epi16(coeff_tmp); + + for (int x = 0; x < width; x += 16) { + uvg_pixel src[32]; + // TODO: faster memory access + for (int xx = 0, d = 0; xx < 16; ++xx, d += 2) { + src[d + 0] = ref[dint[y] + 1 + x + xx + 0]; + src[d + 1] = ref[dint[y] + 1 + x + xx + 1]; + } + + const __m128i* vsrc0 = (const __m128i*)&src[0]; + const __m128i* vsrc1 = (const __m128i*)&src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } + } +} + + static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { // 2-tap linear filter @@ -2110,9 +2193,9 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, delta_fract); break; - case 8: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; - case 16: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; - case 32: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, width, height, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, width, height, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From ad284cba6d8f547a65c714065a5d7f899f1f3c77 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 22 Jan 2024 22:30:38 +0200 Subject: [PATCH 058/237] Implement intra avx2 chroma linear filtering w32. Improve memory access of w16. --- src/strategies/avx2/intra-avx2.c | 88 ++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 01b06b6b..6b3eb0cf 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1425,40 +1425,74 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) { - const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); - // TODO: modify to handle 2 lines at a time. Use __m256 vectors instead. 
We are dealing with chroma, so height must be at least 2 anyway // Handle 1 line at a time for (int y = 0; y < height; ++y) { - int8_t tmp[2] = { 32 - dfract[y], dfract[y]}; - const int16_t coeff_tmp = *(int16_t*)tmp; - const __m128i vcoeff = _mm_set1_epi16(coeff_tmp); + int8_t tmp0[2] = { 32 - delta_fract[y + 0], delta_fract[y + 0]}; + int16_t coeff_tmp = *(int16_t*)tmp0; + __m128i vcoeff = _mm_set1_epi16(coeff_tmp); - for (int x = 0; x < width; x += 16) { - uvg_pixel src[32]; - // TODO: faster memory access - for (int xx = 0, d = 0; xx < 16; ++xx, d += 2) { - src[d + 0] = ref[dint[y] + 1 + x + xx + 0]; - src[d + 1] = ref[dint[y] + 1 + x + xx + 1]; - } + __m128i vsrc0 = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + 0 + 1]); + __m128i vsrc1 = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + 8 + 1]); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} - const __m128i* vsrc0 = (const __m128i*)&src[0]; - const __m128i* vsrc1 = (const __m128i*)&src[16]; - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); +static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const __m256i v16s = _mm256_set1_epi16(16); + const __m256i vshuf = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; - } + // Handle 1 line at a time + for (int y = 0; y < height; ++y) { + int8_t tmp0[2] = { 32 - delta_fract[y + 0], delta_fract[y + 0] }; + int16_t coeff_tmp = *(int16_t*)tmp0; + __m256i vcoeff = _mm256_set1_epi16(coeff_tmp); + + __m128i vsrc[4]; + vsrc[0] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 0 + 1]); + vsrc[1] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 16 + 1]); // Flip these two middle sources. 
They will be later flipped back into place by packus + vsrc[2] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 8 + 1]); + vsrc[3] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 24 + 1]); + + __m256i* vsrc256 = (__m256i*)vsrc; + vsrc256[0] = _mm256_shuffle_epi8(vsrc256[0], vshuf); + vsrc256[1] = _mm256_shuffle_epi8(vsrc256[1], vshuf); + + __m256i res0 = _mm256_maddubs_epi16(vsrc256[0], vcoeff); + __m256i res1 = _mm256_maddubs_epi16(vsrc256[1], vcoeff); + res0 = _mm256_add_epi16(res0, v16s); + res1 = _mm256_add_epi16(res1, v16s); + res0 = _mm256_srai_epi16(res0, 5); + res1 = _mm256_srai_epi16(res1, 5); + + _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); + dst += 32; } } @@ -2194,8 +2228,8 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, delta_fract); break; case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, delta_fract); break; - case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, width, height, delta_int, delta_fract); break; - case 32: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, width, height, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From ff5c9bc113d30c549139675d5331983cb6ed351b Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 22 Jan 2024 22:41:06 +0200 Subject: [PATCH 059/237] Improve memory access of w8. --- src/strategies/avx2/intra-avx2.c | 40 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6b3eb0cf..c84d14dd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1384,36 +1384,32 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; - const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); // Height has to be at least 2, handle 2 lines at once for (int y = 0; y < height; y += 2) { - uvg_pixel src[32]; - int16_t coeff_tmp[2]; - // TODO: faster memory access - for (int yy = 0; yy < 2; ++yy) { - const int offset = yy * 16; - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[offset + d + 0] = ref[dint[yy] + 1 + x + 0]; - src[offset + d + 1] = ref[dint[yy] + 1 + x + 1]; - } - int8_t tmp[2] = { 32 - *dfract, *dfract }; - dfract++; - coeff_tmp[yy] = *(int16_t*)tmp; - } - dint += 2; + int8_t tmp[2] = {32 - delta_fract[y + 0], delta_fract[y + 0]}; + int16_t coeff_tmp0 = *(int16_t*)tmp; + tmp[0] = 32 - delta_fract[y + 1]; + tmp[1] = delta_fract[y + 1]; + int16_t coeff_tmp1 = *(int16_t*)tmp; + - const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp[0]); - const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp[1]); + __m128i vsrc0 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 0] + 1]); + __m128i vsrc1 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 1] + 1]); + vsrc0 = 
_mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; + const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp0); + const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp1); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); From 51d3bcb1b8d863666907e7af35dc511283bc77d4 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 23 Jan 2024 16:52:18 +0200 Subject: [PATCH 060/237] Improve memory access of w4. Hardcoded coefficients for speed testing purposes. Pre-made tables make the w4 function roughly 4x faster. --- src/strategies/avx2/intra-avx2.c | 43 +++++++++++-------------- src/strategies/avx2/intra_avx2_tables.h | 11 +++++++ 2 files changed, 29 insertions(+), 25 deletions(-) create mode 100644 src/strategies/avx2/intra_avx2_tables.h diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c84d14dd..4a5d0e67 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -45,6 +45,7 @@ #include "global.h" #include "intra-avx2.h" +#include "intra_avx2_tables.h" #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" @@ -1338,38 +1339,30 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re const int16_t* dint = delta_int; const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + const __m256i vshuf = _mm256_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + const __m128i vsub = _mm_set1_epi16(0x0020); // 32 and 0 as 8-bit signed integers // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { - uvg_pixel src[32]; - int16_t coeff_tmp[4]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int yy = 0; yy < 4; ++yy) { - src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; - src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; - int8_t tmp[2] = { 32 - *dfract, *dfract }; - dfract++; - coeff_tmp[yy] = *(int16_t*)tmp; - } + const __m256i vidx = _mm256_setr_epi64x(dint[0]+1, dint[1]+1, dint[2]+1, dint[3]+1); dint += 4; - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], - coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], - coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); - //const __m256i vcoeff = _mm256_set1_epi16(*(int16_t*)tmp); + const __m128i vcoeff = _mm_load_si128((const __m128i*)intra_chroma_linear_interpolation_w4_m40); + + __m256i vsrc; + vsrc = _mm256_i64gather_epi64((const long long int*)ref, vidx, 1); + vsrc = _mm256_shuffle_epi8(vsrc, vshuf); - const __m128i* vsrc0 = (const __m128i*)&src[0]; - const __m128i* vsrc1 = (const __m128i*)&src[16]; + __m128i vsrc0 = 
_mm256_extracti128_si256(vsrc, 0); + __m128i vsrc1 = _mm256_extracti128_si256(vsrc, 1); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h new file mode 100644 index 00000000..86886f0f --- /dev/null +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -0,0 +1,11 @@ +#ifndef INTRA_AVX2_TABLES_H +#define INTRA_AVX2_TABLES_H + +#include "global.h" + +// Test table +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m40[] = { + 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, +}; + +#endif INTRA_AVX2_TABLES_H From 8a475cd74be5e0fb396a6547b1c6159873c9ee77 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 25 Jan 2024 18:26:28 +0200 Subject: [PATCH 061/237] WIP Implement intra chroma linear interpolation w4 coefficient handling. Add coefficient tables. --- src/strategies/avx2/intra-avx2.c | 12 +- src/strategies/avx2/intra_avx2_tables.h | 267 ++++++++++++++++++++++++ 2 files changed, 275 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4a5d0e67..7751deba 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1334,10 +1334,10 @@ static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, c } -static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +// Linear interpolation filter for width 4 has a different call, since it uses premade tables for coefficients +static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const bool wide_angle_mode, const int32_t pred_mode) { const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); const __m256i vshuf = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -1347,12 +1347,16 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re ); const __m128i vsub = _mm_set1_epi16(0x0020); // 32 and 0 as 8-bit signed integers + // TODO: Different modes have differing amounts of unique coefficient vectors. Make an offset table, indexed by mode to get correct table offset. + const int table_offset = wide_angle_mode ? 0 : (pred_mode <= 34 ? 
(pred_mode - 2) * 16 : (66 - pred_mode) * 16); + + // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { const __m256i vidx = _mm256_setr_epi64x(dint[0]+1, dint[1]+1, dint[2]+1, dint[3]+1); dint += 4; - const __m128i vcoeff = _mm_load_si128((const __m128i*)intra_chroma_linear_interpolation_w4_m40); + const __m128i vcoeff = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[table_offset]); __m256i vsrc; vsrc = _mm256_i64gather_epi64((const long long int*)ref, vidx, 1); @@ -2215,7 +2219,7 @@ static void uvg_angular_pred_avx2( // Do 2-tap linear filtering for chroma channels if (vertical_mode) { switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, wide_angle_mode, pred_mode); break; case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, delta_fract); break; case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, delta_fract); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 86886f0f..d595de2a 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -8,4 +8,271 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m40[] = { 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, }; +// The number of unique 128-bit coefficient vectors for a given prediction mode. Applicable for width 4 chroma linear interpolation. +const int8_t coeff_vector128_num_by_mode[33] = { + 1, 16, 8, 16, 4, 8, 1, 8, 4, 8, 2, 8, 4, 16, 8, 16, + 1, 16, 8, 16, 4, 8, 2, 8, 4, 8, 1, 8, 4, 16, 8, 16, 1 +}; + +// Chroma linear interpolation coefficients for width 4. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 + 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, +15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, +21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, +27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, + 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, + 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, +13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, +19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, +25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, +31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, + 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, +11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, +17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, +23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, +29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 4 +18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, +30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, +10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, +22, 10, 22, 10, 22, 10, 22, 10, 28, 4, 28, 4, 28, 4, 28, 4, + 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, +14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, +26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, + 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 5 +27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, +13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, +31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, +17, 15, 17, 15, 17, 15, 17, 15, 26, 6, 26, 6, 26, 6, 26, 6, + 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, +21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, + 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, +25, 7, 25, 7, 25, 7, 25, 7, 2, 30, 2, 30, 2, 30, 2, 30, +11, 21, 11, 21, 11, 21, 11, 21, 20, 12, 20, 12, 20, 12, 20, 12, +29, 3, 29, 3, 29, 3, 29, 3, 6, 26, 6, 26, 6, 26, 6, 26, +15, 17, 15, 17, 15, 17, 15, 17, 24, 8, 24, 8, 24, 8, 24, 8, + 1, 31, 1, 31, 1, 31, 1, 31, 10, 22, 10, 22, 10, 22, 10, 22, +19, 13, 19, 13, 19, 13, 19, 13, 28, 4, 28, 4, 28, 4, 28, 4, + 5, 27, 5, 27, 5, 27, 5, 27, 14, 18, 14, 18, 14, 18, 14, 18, +23, 9, 23, 9, 23, 9, 23, 9, 32, 0, 32, 0, 32, 0, 32, 0, +12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 6 + 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, +28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, +20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, +14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 7 +10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, + 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, + 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, +30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, +26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, +22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, +18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, +16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 8 +18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 9 +22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, +26, 6, 
26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, +30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, + 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, + 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, +10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, +14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, +20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 10 +28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, + 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, +12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, +22, 10, 22, 10, 22, 10, 22, 10, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 11 + 2, 30, 2, 30, 2, 30, 2, 30, 24, 8, 24, 8, 24, 8, 24, 8, +14, 18, 14, 18, 14, 18, 14, 18, 4, 28, 4, 28, 4, 28, 4, 28, +26, 6, 26, 6, 26, 6, 26, 6, 16, 16, 16, 16, 16, 16, 16, 16, + 6, 26, 6, 26, 6, 26, 6, 26, 28, 4, 28, 4, 28, 4, 28, 4, +18, 14, 18, 14, 18, 14, 18, 14, 8, 24, 8, 24, 8, 24, 8, 24, +30, 2, 30, 2, 30, 2, 30, 2, 20, 12, 20, 12, 20, 12, 20, 12, +10, 22, 10, 22, 10, 22, 10, 22, 32, 0, 32, 0, 32, 0, 32, 0, +24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 12 + 8, 24, 8, 24, 8, 24, 8, 24, 32, 0, 32, 0, 32, 0, 32, 0, +26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 13 +14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, +22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, +10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, +30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, +18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, +28, 4, 28, 4, 28, 4, 28, 4, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 14 +20, 12, 20, 12, 20, 12, 20, 12, 16, 16, 16, 16, 16, 16, 16, 16, +12, 20, 12, 20, 12, 20, 12, 20, 8, 24, 8, 24, 8, 24, 8, 24, + 4, 28, 4, 28, 4, 28, 4, 28, 32, 0, 32, 0, 32, 0, 32, 0, +29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 15 +23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, +17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, +11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, +31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, +25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, +19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, +13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, + 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, + 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, +27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, +21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, +15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, + 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, + 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, +30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 16 +26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, +22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, +18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, +14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, +10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, + 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, + 2, 30, 2, 30, 2, 30, 2, 30, 32, 0, 32, 0, 32, 0, 32, 0, +31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, // Mode 17 +29, 3, 29, 3, 29, 3, 29, 
3, 28, 4, 28, 4, 28, 4, 28, 4, +27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, +25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, +23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, +21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, +19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, +17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, +15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, +13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, +11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, + 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, + 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, + 1, 31, 1, 31, 1, 31, 1, 31, 32, 0, 32, 0, 32, 0, 32, 0, +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 1, 31, 1, 31, 1, 31, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 19 + 3, 29, 3, 29, 3, 29, 3, 29, 4, 28, 4, 28, 4, 28, 4, 28, + 5, 27, 5, 27, 5, 27, 5, 27, 6, 26, 6, 26, 6, 26, 6, 26, + 7, 25, 7, 25, 7, 25, 7, 25, 8, 24, 8, 24, 8, 24, 8, 24, + 9, 23, 9, 23, 9, 23, 9, 23, 10, 22, 10, 22, 10, 22, 10, 22, +11, 21, 11, 21, 11, 21, 11, 21, 12, 20, 12, 20, 12, 20, 12, 20, +13, 19, 13, 19, 13, 19, 13, 19, 14, 18, 14, 18, 14, 18, 14, 18, +15, 17, 15, 17, 15, 17, 15, 17, 16, 16, 16, 16, 16, 16, 16, 16, +17, 15, 17, 15, 17, 15, 17, 15, 18, 14, 18, 14, 18, 14, 18, 14, +19, 13, 19, 13, 19, 13, 19, 13, 20, 12, 20, 12, 20, 12, 20, 12, +21, 11, 21, 11, 21, 11, 21, 11, 22, 10, 22, 10, 22, 10, 22, 10, +23, 9, 23, 9, 23, 9, 23, 9, 24, 8, 24, 8, 24, 8, 24, 8, +25, 7, 25, 7, 25, 7, 25, 7, 26, 6, 26, 6, 26, 6, 26, 6, +27, 5, 27, 5, 27, 5, 27, 5, 28, 4, 28, 4, 28, 4, 28, 4, +29, 3, 29, 3, 29, 3, 29, 3, 30, 2, 30, 2, 30, 2, 30, 2, +31, 1, 31, 1, 31, 1, 31, 1, 32, 0, 32, 0, 32, 0, 32, 0, + 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 20 + 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, +10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, +14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, +18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, +22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, +26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, +30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 21 + 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, +15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, +21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, +27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, + 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, + 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, +13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, +19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, +25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, +31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, + 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, +11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, +17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, +23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, +29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, + 4, 28, 4, 28, 4, 28, 4, 28, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 22 +12, 20, 12, 20, 12, 20, 12, 20, 16, 16, 16, 16, 16, 16, 16, 16, +20, 12, 
20, 12, 20, 12, 20, 12, 24, 8, 24, 8, 24, 8, 24, 8, +28, 4, 28, 4, 28, 4, 28, 4, 32, 0, 32, 0, 32, 0, 32, 0, + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 23 +18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, +30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, +10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, +22, 10, 22, 10, 22, 10, 22, 10, 28, 4, 28, 4, 28, 4, 28, 4, + 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, +14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, +26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, + 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 24 +24, 8, 24, 8, 24, 8, 24, 8, 32, 0, 32, 0, 32, 0, 32, 0, +10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 25 +30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, +18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, + 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, +26, 6, 26, 6, 26, 6, 26, 6, 4, 28, 4, 28, 4, 28, 4, 28, +14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, + 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, +22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, +12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 26 + 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, +28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, +20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, +14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 27 +10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, + 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, + 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, +30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, +26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, +22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, +18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, +16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 28 +18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 29 +22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, +26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, +30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, + 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, + 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, +10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, +14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, +20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 30 +28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, + 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, +12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, +23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 31 + 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, +19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, + 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, +15, 17, 15, 17, 15, 17, 15, 17, 6, 26, 6, 26, 6, 26, 6, 26, +29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, +11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, +25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, + 7, 25, 7, 25, 7, 25, 7, 25, 30, 2, 30, 2, 30, 2, 30, 2, +21, 11, 21, 11, 21, 11, 21, 11, 12, 20, 12, 20, 12, 20, 12, 20, + 3, 29, 3, 29, 3, 29, 3, 29, 26, 6, 26, 6, 26, 6, 26, 6, +17, 15, 17, 15, 17, 15, 17, 15, 8, 24, 8, 24, 8, 24, 8, 24, +31, 1, 31, 
1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, +13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, +27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, + 9, 23, 9, 23, 9, 23, 9, 23, 32, 0, 32, 0, 32, 0, 32, 0, +26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 32 +14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, +22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, +10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, +30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, +18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, +29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 33 +23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, +17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, +11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, +31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, +25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, +19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, +13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, + 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, + 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, +27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, +21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, +15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, + 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, + 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + #endif INTRA_AVX2_TABLES_H From f220c5318a651788b52f735f957341162ecb0b9c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 25 Jan 2024 23:33:34 +0200 Subject: [PATCH 062/237] Implement intra chroma linear interpolation w4 coefficient handling. --- src/strategies/avx2/intra-avx2.c | 18 ++++-- src/strategies/avx2/intra_avx2_tables.h | 73 ++++++++++++++----------- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7751deba..84556dbc 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1348,15 +1348,19 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re const __m128i vsub = _mm_set1_epi16(0x0020); // 32 and 0 as 8-bit signed integers // TODO: Different modes have differing amounts of unique coefficient vectors. Make an offset table, indexed by mode to get correct table offset. - const int table_offset = wide_angle_mode ? 0 : (pred_mode <= 34 ? (pred_mode - 2) * 16 : (66 - pred_mode) * 16); - + const int mode_idx = wide_angle_mode ? 0 : (pred_mode <= 34 ? 
(pred_mode - 2) : (66 - pred_mode)); + const int table_offset = coeff_table_mode_offsets[mode_idx]; + const int vnum = coeff_vector128_num_by_mode[mode_idx]; + int offset_num = 0; // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { + const int offset = table_offset + (offset_num * 16); const __m256i vidx = _mm256_setr_epi64x(dint[0]+1, dint[1]+1, dint[2]+1, dint[3]+1); dint += 4; - const __m128i vcoeff = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[table_offset]); + const __m128i vcoeff0 = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[offset]); + const __m128i vcoeff1 = vnum == 1 ? vcoeff0 : _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[offset + 16]); __m256i vsrc; vsrc = _mm256_i64gather_epi64((const long long int*)ref, vidx, 1); @@ -1365,8 +1369,8 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re __m128i vsrc0 = _mm256_extracti128_si256(vsrc, 0); __m128i vsrc1 = _mm256_extracti128_si256(vsrc, 1); - __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); - __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); @@ -1374,6 +1378,10 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); dst += 16; + offset_num += 2; + if (offset_num >= vnum) { + offset_num = 0; + } } } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index d595de2a..ff4a4018 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -14,10 +14,17 @@ const int8_t coeff_vector128_num_by_mode[33] = { 1, 16, 8, 16, 4, 8, 2, 8, 4, 8, 1, 8, 4, 16, 8, 16, 1 }; + +const int16_t coeff_table_mode_offsets[33] = { + 0, 16, 272, 400, 656, 720, 848, 864, 992, 1056, 1184, 1216, 1344, 1408, 1664, 1792, + 2048, 2064, 2320, 2448, 2704, 2768, 2896, 2928, 3056, 3120, 3248, 3264, 3392, 3456, 3712, 3840, 4096 +}; + + // Chroma linear interpolation coefficients for width 4. 
ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 - 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 Offset 16 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, 15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, 21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, @@ -33,7 +40,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, 23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, 29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, - 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 4 + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 4 Offset 272 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, @@ -41,7 +48,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, 14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, 26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, - 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 5 + 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 5 Offset 400 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, @@ -57,11 +64,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 19, 13, 19, 13, 19, 13, 19, 13, 28, 4, 28, 4, 28, 4, 28, 4, 5, 27, 5, 27, 5, 27, 5, 27, 14, 18, 14, 18, 14, 18, 14, 18, 23, 9, 23, 9, 23, 9, 23, 9, 32, 0, 32, 0, 32, 0, 32, 0, -12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 6 +12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 6 Offset 656 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, 28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, 20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, -14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 7 +14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 7 Offset 720 10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, @@ -69,8 +76,8 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, 22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, 18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, -16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 8 -18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 9 +16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 8 Offset 848 +18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 9 Offset 864 22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, 26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, 30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, @@ -78,11 +85,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 
24, 8, 24, 8, 24, 8, 10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, 14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, -20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 10 +20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 10 Offset 992 28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, 12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, -22, 10, 22, 10, 22, 10, 22, 10, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 11 +22, 10, 22, 10, 22, 10, 22, 10, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 11 Offset 1056 2, 30, 2, 30, 2, 30, 2, 30, 24, 8, 24, 8, 24, 8, 24, 8, 14, 18, 14, 18, 14, 18, 14, 18, 4, 28, 4, 28, 4, 28, 4, 28, 26, 6, 26, 6, 26, 6, 26, 6, 16, 16, 16, 16, 16, 16, 16, 16, @@ -90,9 +97,9 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 18, 14, 18, 14, 18, 14, 18, 14, 8, 24, 8, 24, 8, 24, 8, 24, 30, 2, 30, 2, 30, 2, 30, 2, 20, 12, 20, 12, 20, 12, 20, 12, 10, 22, 10, 22, 10, 22, 10, 22, 32, 0, 32, 0, 32, 0, 32, 0, -24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 12 +24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 12 Offset 1184 8, 24, 8, 24, 8, 24, 8, 24, 32, 0, 32, 0, 32, 0, 32, 0, -26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 13 +26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 13 Offset 1216 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, @@ -100,11 +107,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, -28, 4, 28, 4, 28, 4, 28, 4, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 14 +28, 4, 28, 4, 28, 4, 28, 4, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 14 Offset 1344 20, 12, 20, 12, 20, 12, 20, 12, 16, 16, 16, 16, 16, 16, 16, 16, 12, 20, 12, 20, 12, 20, 12, 20, 8, 24, 8, 24, 8, 24, 8, 24, 4, 28, 4, 28, 4, 28, 4, 28, 32, 0, 32, 0, 32, 0, 32, 0, -29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 15 +29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 15 Offset 1408 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, @@ -120,7 +127,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, -30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 16 +30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 16 Offset 1664 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, @@ -128,7 +135,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 32, 0, 32, 0, 32, 0, 32, 0, -31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, // Mode 17 +31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, // 
Mode 17 Offset 1792 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, @@ -144,8 +151,8 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 1, 31, 1, 31, 1, 31, 1, 31, 32, 0, 32, 0, 32, 0, 32, 0, -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 - 1, 31, 1, 31, 1, 31, 1, 31, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 19 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 Offset 2048 + 1, 31, 1, 31, 1, 31, 1, 31, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 19 Offset 2064 3, 29, 3, 29, 3, 29, 3, 29, 4, 28, 4, 28, 4, 28, 4, 28, 5, 27, 5, 27, 5, 27, 5, 27, 6, 26, 6, 26, 6, 26, 6, 26, 7, 25, 7, 25, 7, 25, 7, 25, 8, 24, 8, 24, 8, 24, 8, 24, @@ -161,7 +168,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 27, 5, 27, 5, 27, 5, 27, 5, 28, 4, 28, 4, 28, 4, 28, 4, 29, 3, 29, 3, 29, 3, 29, 3, 30, 2, 30, 2, 30, 2, 30, 2, 31, 1, 31, 1, 31, 1, 31, 1, 32, 0, 32, 0, 32, 0, 32, 0, - 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 20 + 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 20 Offset 2320 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, @@ -169,7 +176,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, - 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 21 + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 21 Offset 2448 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, 15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, 21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, @@ -185,11 +192,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, 23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, 29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, - 4, 28, 4, 28, 4, 28, 4, 28, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 22 + 4, 28, 4, 28, 4, 28, 4, 28, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 22 Offset 2704 12, 20, 12, 20, 12, 20, 12, 20, 16, 16, 16, 16, 16, 16, 16, 16, 20, 12, 20, 12, 20, 12, 20, 12, 24, 8, 24, 8, 24, 8, 24, 8, 28, 4, 28, 4, 28, 4, 28, 4, 32, 0, 32, 0, 32, 0, 32, 0, - 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 23 + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 23 Offset 2768 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, @@ -197,9 +204,9 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, 14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, 26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, - 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 24 + 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 24 Offset 2896 24, 8, 24, 8, 24, 8, 24, 8, 32, 0, 32, 0, 32, 0, 32, 0, -10, 22, 10, 22, 10, 22, 10, 
22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 25 +10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 25 Offset 2928 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, 18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, @@ -207,11 +214,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, 22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, -12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 26 +12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 26 Offset 3056 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, 28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, 20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, -14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 27 +14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 27 Offset 3120 10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, @@ -219,8 +226,8 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, 22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, 18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, -16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 28 -18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 29 +16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 28 Offset 3248 +18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 29 Offset 3264 22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, 26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, 30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, @@ -228,11 +235,11 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, 10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, 14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, -20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 30 +20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 30 Offset 3392 28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, 12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, -23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 31 +23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 31 Offset 3456 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, @@ -248,7 +255,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 9, 23, 9, 23, 9, 23, 9, 23, 32, 0, 32, 0, 32, 0, 32, 0, -26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 32 +26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 32 Offset 3712 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, @@ 
-256,7 +263,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, -29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 33 +29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 33 Offset 3840 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, @@ -272,7 +279,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { 15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 Offset 4096 }; #endif INTRA_AVX2_TABLES_H From 0202e5a31f7e69a16b3aa66b4433cb1395986059 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 26 Jan 2024 15:49:21 +0200 Subject: [PATCH 063/237] Implement intra avx2 chroma linear interpolation for horizontal modes for all block widths. --- src/strategies/avx2/intra-avx2.c | 186 +++++++++++++++++++++++++++++-- 1 file changed, 176 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 84556dbc..fbb5202e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1345,12 +1345,11 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c ); - const __m128i vsub = _mm_set1_epi16(0x0020); // 32 and 0 as 8-bit signed integers - // TODO: Different modes have differing amounts of unique coefficient vectors. Make an offset table, indexed by mode to get correct table offset. const int mode_idx = wide_angle_mode ? 0 : (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); const int table_offset = coeff_table_mode_offsets[mode_idx]; const int vnum = coeff_vector128_num_by_mode[mode_idx]; + const int modulo = vnum - 1; int offset_num = 0; // Height has to be at least 4, handle 4 lines at once @@ -1379,9 +1378,8 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); dst += 16; offset_num += 2; - if (offset_num >= vnum) { - offset_num = 0; - } + // This resets the offset number to 0 when it reaches the end of the table. Only works on powers of 2. 
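+      // (Editorial note on the wrap-around above: it relies on vnum always being a
+      // power of two, as the values in coeff_vector128_num_by_mode are. Then
+      // modulo == vnum - 1 has all of its low bits set, so offset_num & modulo
+      // wraps offset_num back to 0 exactly when it reaches vnum.)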
+ offset_num &= modulo; } } @@ -1498,6 +1496,174 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r } +static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m128i v16s = _mm_set1_epi16(16); + + int8_t tmp_coeff[16]; + for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { + tmp_coeff[offset + 0] = 32 - dfract[x]; + tmp_coeff[offset + 1] = dfract[x]; + tmp_coeff[8 + offset + 0] = 32 - dfract[x]; + tmp_coeff[8 + offset + 1] = dfract[x]; + } + __m128i* vcoeff = (__m128i*) &tmp_coeff[0]; + + // Height has to be at least 4, handle 4 lines at once + for (int y = 0; y < height; y += 4) { + // TODO: find a more efficient way to do this + uvg_pixel src[32]; + for (int yy = 0; yy < 4; ++yy) { + for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { + const int ref_offset = dint[x] + y + yy + 1; + src[yy * 8 + offset + 0] = ref[ref_offset + 0]; + src[yy * 8 + offset + 1] = ref[ref_offset + 1]; + } + } + __m128i* vsrc0 = (__m128i*)&src[0]; + __m128i* vsrc1 = (__m128i*)&src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m128i v16s = _mm_set1_epi16(16); + + int8_t tmp_coeff[16]; + for (int x = 0, offset = 0; x < 8; ++x, offset += 2) { + tmp_coeff[offset + 0] = 32 - dfract[x]; + tmp_coeff[offset + 1] = dfract[x]; + } + __m128i* vcoeff = (__m128i*) &tmp_coeff[0]; + + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + // TODO: find a more efficient way to do this + uvg_pixel src[32]; + for (int yy = 0; yy < 2; ++yy) { + for (int x = 0, offset = 0; x < 8; ++x, offset += 2) { + const int ref_offset = dint[x] + y + yy + 1; + src[yy * 16 + offset + 0] = ref[ref_offset + 0]; + src[yy * 16 + offset + 1] = ref[ref_offset + 1]; + } + } + + __m128i* vsrc0 = (__m128i*) & src[0]; + __m128i* vsrc1 = (__m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m128i v16s = _mm_set1_epi16(16); + + int8_t tmp_coeff[32]; + for (int x = 0, offset = 0; x < 16; ++x, offset += 2) { + tmp_coeff[offset + 0] = 32 - dfract[x]; + tmp_coeff[offset + 1] = dfract[x]; + } + __m128i* vcoeff0 = (__m128i*) &tmp_coeff[0]; + __m128i* vcoeff1 = (__m128i*) &tmp_coeff[16]; + + // Height has to be at least 1, handle 1 line at a time + for (int y = 0; y < height; ++y) { + // TODO: find a more efficient way to do this + 
uvg_pixel src[32]; + for (int x = 0, offset = 0; x < 16; ++x, offset += 2) { + const int ref_offset = dint[x] + y + 1; + src[offset + 0] = ref[ref_offset + 0]; + src[offset + 1] = ref[ref_offset + 1]; + } + + __m128i* vsrc0 = (__m128i*) & src[0]; + __m128i* vsrc1 = (__m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int16_t* dint = delta_int; + const int16_t* dfract = delta_fract; + const __m256i v16s = _mm256_set1_epi16(16); + + int8_t tmp_coeff[64]; + for (int x = 0, offset = 0; x < 32; ++x, offset += 2) { + tmp_coeff[offset + 0] = 32 - dfract[x]; + tmp_coeff[offset + 1] = dfract[x]; + } + __m256i* vcoeff0 = (__m256i*) &tmp_coeff[0]; + __m256i* vcoeff1 = (__m256i*) &tmp_coeff[32]; + + // Height has to be at least 1, handle 1 line at a time + for (int y = 0; y < height; ++y) { + // TODO: find a more efficient way to do this + uvg_pixel src[64]; + for (int x = 0, offset = 0; x < 32; ++x, offset += 2) { + const int ref_offset = dint[x] + y + 1; + src[offset + 0] = ref[ref_offset + 0]; + src[offset + 1] = ref[ref_offset + 1]; + } + + + __m256i* vsrc0 = (__m256i*) &src[0]; + __m256i* vsrc1 = (__m256i*) &src[32]; + + __m256i res0 = _mm256_maddubs_epi16(*vsrc0, *vcoeff0); + __m256i res1 = _mm256_maddubs_epi16(*vsrc1, *vcoeff1); + res0 = _mm256_add_epi16(res0, v16s); + res1 = _mm256_add_epi16(res1, v16s); + res0 = _mm256_srai_epi16(res0, 5); + res1 = _mm256_srai_epi16(res1, 5); + //res0 = _mm256_permute4x64_epi64(res0, _MM_SHUFFLE(3, 1, 2, 0)); + //res1 = _mm256_permute4x64_epi64(res1, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i res_final = _mm256_packus_epi16(res0, res1); + res_final = _mm256_permute4x64_epi64(res_final, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, res_final); + dst += 32; + } +} + + static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { // 2-tap linear filter @@ -2237,11 +2403,11 @@ static void uvg_angular_pred_avx2( } } else { - switch (width) { // TODO: this generic solution does not work for horizontal modes. 
Start by implementing the vertical prediction first - case 4: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; - case 8: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; - case 16: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; - case 32: angular_pred_generic_linear_filter(dst, ref_main, width, height, delta_int, delta_fract); break; + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From b5f37076ceb4d02d6e183eb7bf715c70fc1a8594 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 29 Jan 2024 22:12:44 +0200 Subject: [PATCH 064/237] Implement faster version of intra avx2 chroma linear interpolation for horizontal w4. Handle reference sample memory management with gathers and shuffle vectors fetched from memory. This version is hard coded for mode 30 for speed testing purposes. --- src/strategies/avx2/intra-avx2.c | 31 +++++++++++++++++++------ src/strategies/avx2/intra_avx2_tables.h | 8 +++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fbb5202e..7a43502d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1502,6 +1502,9 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + // TODO: hard coded some stuff to test mode 30 with new faster system + + // TODO: fetch coeffs (filter weights) from table instead of constructing int8_t tmp_coeff[16]; for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { tmp_coeff[offset + 0] = 32 - dfract[x]; @@ -1511,10 +1514,23 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re } __m128i* vcoeff = (__m128i*) &tmp_coeff[0]; + __m128i vshuf[2]; + vshuf[0] = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_w4_m30[0]); + vshuf[1] = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_w4_m30[16]); + + // Prepare sources + const int16_t min_offset = 1 + MIN(dint[0], dint[3]); + __m128i vsrc[16]; + for (int y = 0, d = 0; y < height; y += 4, d += 2) { + __m128i vidx = _mm_set_epi64x((long long int)(min_offset + y + 2), (long long int)(min_offset + y + 0)); + __m128i vsrc_tmp = _mm_i64gather_epi64((const long long*)ref, vidx, 1); + vsrc[d + 0] = _mm_shuffle_epi8(vsrc_tmp, vshuf[0]); + vsrc[d + 1] = _mm_shuffle_epi8(vsrc_tmp, vshuf[1]); + } + // Height has to be at least 4, handle 4 lines at once - for (int y = 0; y < height; y += 4) { - // TODO: find a more efficient way to do this - uvg_pixel src[32]; + for (int y = 0, s = 0; y < height; y += 4, s += 2) { + /*uvg_pixel src[32]; for (int yy = 0; yy < 4; ++yy) { for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { const int ref_offset = dint[x] + y + yy + 1; @@ -1522,11 +1538,12 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re src[yy * 8 + offset + 1] = 
ref[ref_offset + 1]; } } - __m128i* vsrc0 = (__m128i*)&src[0]; - __m128i* vsrc1 = (__m128i*)&src[16]; - __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff); + __m128i* vsrc0 = (__m128i*) & src[0]; + __m128i* vsrc1 = (__m128i*) & src[16];*/ + + __m128i res0 = _mm_maddubs_epi16(vsrc[s + 0], *vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc[s + 1], *vcoeff); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index ff4a4018..3a53512a 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -8,6 +8,14 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m40[] = { 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, }; +// Another test table +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_w4_m30[] = { + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a +}; + // The number of unique 128-bit coefficient vectors for a given prediction mode. Applicable for width 4 chroma linear interpolation. const int8_t coeff_vector128_num_by_mode[33] = { 1, 16, 8, 16, 4, 8, 1, 8, 4, 8, 2, 8, 4, 16, 8, 16, From 0996f4ecdf2ce81559d1536ed9f1268f0be2ae07 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Jan 2024 01:08:26 +0200 Subject: [PATCH 065/237] Remove mode 30 hard coding. Add shuffle vector table and proper table indexing. --- src/strategies/avx2/intra-avx2.c | 62 +++----- src/strategies/avx2/intra_avx2_tables.h | 193 +++++++++++++++++++++++- 2 files changed, 210 insertions(+), 45 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7a43502d..5882b668 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1358,8 +1358,8 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re const __m256i vidx = _mm256_setr_epi64x(dint[0]+1, dint[1]+1, dint[2]+1, dint[3]+1); dint += 4; - const __m128i vcoeff0 = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[offset]); - const __m128i vcoeff1 = vnum == 1 ? vcoeff0 : _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_w4[offset + 16]); + const __m128i vcoeff0 = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_weights_w4_ver[offset]); + const __m128i vcoeff1 = vnum == 1 ? 
vcoeff0 : _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_weights_w4_ver[offset + 16]); __m256i vsrc; vsrc = _mm256_i64gather_epi64((const long long int*)ref, vidx, 1); @@ -1496,54 +1496,34 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); - // TODO: hard coded some stuff to test mode 30 with new faster system + const int16_t weigth_offset = mode_to_weight_table_offset_w4_hor[mode]; + const int16_t shuf_offset = mode_to_shuffle_vector_table_offset_w4_hor[mode]; - // TODO: fetch coeffs (filter weights) from table instead of constructing - int8_t tmp_coeff[16]; - for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { - tmp_coeff[offset + 0] = 32 - dfract[x]; - tmp_coeff[offset + 1] = dfract[x]; - tmp_coeff[8 + offset + 0] = 32 - dfract[x]; - tmp_coeff[8 + offset + 1] = dfract[x]; - } - __m128i* vcoeff = (__m128i*) &tmp_coeff[0]; + __m128i vkek0 = _mm_load_si128((const __m128i*) intra_chroma_linear_interpolation_shuffle_w4_m30); + __m128i vkek1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_w4_m30[16]); - __m128i vshuf[2]; - vshuf[0] = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_w4_m30[0]); - vshuf[1] = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_w4_m30[16]); + __m128i vcoeff = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor[weigth_offset]); + __m128i vshuf0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[shuf_offset + 0]); + __m128i vshuf1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[shuf_offset + 16]); - // Prepare sources + // Load refs from smallest index onwards, shuffle will handle the rest. 
The smallest index will be at one of these delta int table indices const int16_t min_offset = 1 + MIN(dint[0], dint[3]); - __m128i vsrc[16]; - for (int y = 0, d = 0; y < height; y += 4, d += 2) { - __m128i vidx = _mm_set_epi64x((long long int)(min_offset + y + 2), (long long int)(min_offset + y + 0)); - __m128i vsrc_tmp = _mm_i64gather_epi64((const long long*)ref, vidx, 1); - vsrc[d + 0] = _mm_shuffle_epi8(vsrc_tmp, vshuf[0]); - vsrc[d + 1] = _mm_shuffle_epi8(vsrc_tmp, vshuf[1]); - } // Height has to be at least 4, handle 4 lines at once - for (int y = 0, s = 0; y < height; y += 4, s += 2) { - /*uvg_pixel src[32]; - for (int yy = 0; yy < 4; ++yy) { - for (int x = 0, offset = 0; x < 4; ++x, offset += 2) { - const int ref_offset = dint[x] + y + yy + 1; - src[yy * 8 + offset + 0] = ref[ref_offset + 0]; - src[yy * 8 + offset + 1] = ref[ref_offset + 1]; - } - } - - __m128i* vsrc0 = (__m128i*) & src[0]; - __m128i* vsrc1 = (__m128i*) & src[16];*/ - - __m128i res0 = _mm_maddubs_epi16(vsrc[s + 0], *vcoeff); - __m128i res1 = _mm_maddubs_epi16(vsrc[s + 1], *vcoeff); + for (int y = 0; y < height; y += 4) { + // Prepare sources + __m128i vidx = _mm_set_epi64x((long long int)(min_offset + y + 2), (long long int)(min_offset + y + 0)); + __m128i vsrc_tmp = _mm_i64gather_epi64((const long long*)ref, vidx, 1); + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp, vshuf1); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); @@ -2421,7 +2401,7 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, delta_int, delta_fract); break; case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, delta_int, delta_fract); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 3a53512a..4f7ba9ce 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3,12 +3,11 @@ #include "global.h" -// Test table +// Test tables ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m40[] = { 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, }; -// Another test table ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_w4_m30[] = { 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, @@ -16,6 +15,10 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_w4_m30[] = { 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a }; +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m30_coeff[] = { + 20, 12, 8, 24, 28, 04, 16, 16, 20, 12, 8, 24, 28, 04, 16, 16, +}; + // The number of unique 128-bit coefficient vectors for a given prediction mode. Applicable for width 4 chroma linear interpolation. 
const int8_t coeff_vector128_num_by_mode[33] = { 1, 16, 8, 16, 4, 8, 1, 8, 4, 8, 2, 8, 4, 16, 8, 16, @@ -28,9 +31,191 @@ const int16_t coeff_table_mode_offsets[33] = { 2048, 2064, 2320, 2448, 2704, 2768, 2896, 2928, 3056, 3120, 3248, 3264, 3392, 3456, 3712, 3840, 4096 }; +ALIGNED(32) const int16_t mode_to_weight_table_offset_w4_hor[35] = { + 0, 0, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512 +}; + +ALIGNED(32) const int16_t mode_to_shuffle_vector_table_offset_w4_hor[35] = { + 0, 0, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024 +}; + + +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[] = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 3 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 4 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, // Mode 5 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 6 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 7 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 8 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, // Mode 9 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, // Mode 10 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 11 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 12 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 13 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 14 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 
0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 15 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 16 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 25 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 26 + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 27 + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 29 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 30 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, + 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 31 + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x0a, 0x0b, 0x09, 0x0a, 0x08, 
0x09, 0x08, 0x09, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 32 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a +}; + + +// Chroma linear interpolation filter weights for width 4, horizontal modes +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 9, 23, 18, 14, 27, 5, 4, 28, // Mode 5 +12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 6 +14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 7 +16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 +18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 9 +20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 10 +22, 10, 12, 20, 2, 30, 24, 8, 22, 10, 12, 20, 2, 30, 24, 8, // Mode 11 +24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 +26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 13 +28, 4, 24, 8, 20, 12, 16, 16, 28, 4, 24, 8, 20, 12, 16, 16, // Mode 14 +29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 15 +30, 2, 28, 4, 26, 6, 24, 8, 30, 2, 28, 4, 26, 6, 24, 8, // Mode 16 +31, 1, 30, 2, 29, 3, 28, 4, 31, 1, 30, 2, 29, 3, 28, 4, // Mode 17 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 1, 31, 2, 30, 3, 29, 4, 28, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 2, 30, 4, 28, 6, 26, 8, 24, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 4, 28, 8, 24, 12, 20, 16, 16, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 +10, 22, 20, 12, 30, 2, 8, 24, 10, 22, 20, 12, 30, 2, 8, 24, // Mode 25 +12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 26 +14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 27 +16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 +18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 29 +20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 30 +23, 9, 14, 18, 5, 27, 28, 4, 23, 9, 14, 18, 5, 27, 28, 4, // Mode 31 +26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 32 +29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 33 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + -// Chroma linear interpolation coefficients for width 4. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4[4112] = { +// Chroma linear interpolation filter weights for width 4, vertical modes. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 Offset 16 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, From 770e1931456fea2dd7bc6a5ad986e845926a6f32 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Jan 2024 16:28:31 +0200 Subject: [PATCH 066/237] Implement faster version of intra avx2 chroma linear interpolation for horizontal w8. Add weight and shuffle tables for w8. --- src/strategies/avx2/intra-avx2.c | 42 +- src/strategies/avx2/intra_avx2_tables.h | 688 ++++++++++++++---------- 2 files changed, 414 insertions(+), 316 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5882b668..13d4b823 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1504,9 +1504,6 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re const int16_t weigth_offset = mode_to_weight_table_offset_w4_hor[mode]; const int16_t shuf_offset = mode_to_shuffle_vector_table_offset_w4_hor[mode]; - __m128i vkek0 = _mm_load_si128((const __m128i*) intra_chroma_linear_interpolation_shuffle_w4_m30); - __m128i vkek1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_w4_m30[16]); - __m128i vcoeff = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor[weigth_offset]); __m128i vshuf0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[shuf_offset + 0]); __m128i vshuf1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[shuf_offset + 16]); @@ -1535,36 +1532,29 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 16; + const int16_t shuf_offset = (mode - 2) * 32; - int8_t tmp_coeff[16]; - for (int x = 0, offset = 0; x < 8; ++x, offset += 2) { - tmp_coeff[offset + 0] = 32 - dfract[x]; - tmp_coeff[offset + 1] = dfract[x]; - } - __m128i* vcoeff = (__m128i*) &tmp_coeff[0]; + __m128i vcoeff = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w8_hor[weigth_offset]); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w8_hor[shuf_offset + 0]); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w8_hor[shuf_offset + 16]); + + // Load refs from smallest index onwards, shuffle will handle the rest. 
The smallest index will be at one of these delta int table indices + const int16_t min_offset = 1 + MIN(dint[0], dint[7]); // Height has to be at least 2, handle 2 lines at once for (int y = 0; y < height; y += 2) { - // TODO: find a more efficient way to do this - uvg_pixel src[32]; - for (int yy = 0; yy < 2; ++yy) { - for (int x = 0, offset = 0; x < 8; ++x, offset += 2) { - const int ref_offset = dint[x] + y + yy + 1; - src[yy * 16 + offset + 0] = ref[ref_offset + 0]; - src[yy * 16 + offset + 1] = ref[ref_offset + 1]; - } - } - - __m128i* vsrc0 = (__m128i*) & src[0]; - __m128i* vsrc1 = (__m128i*) & src[16]; + // Prepare sources + __m128i vsrc_tmp = _mm_loadu_si128((__m128i*)&ref[min_offset + y]); + const __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp, vshuf0); + const __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp, vshuf1); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); res0 = _mm_srai_epi16(res0, 5); @@ -2402,7 +2392,7 @@ static void uvg_angular_pred_avx2( else { switch (width) { case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, delta_int, delta_fract); break; case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, delta_int, delta_fract); break; default: diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 4f7ba9ce..233057b7 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -176,303 +176,411 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ho }; +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_hor[] = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 2 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 3 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 4 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // Mode 5 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // Mode 6 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 7 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 8 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 
0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, // Mode 9 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 10 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, // Mode 11 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 12 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 13 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 14 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 15 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 16 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 25 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 
0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 26 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 27 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 29 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 30 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 31 + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 32 + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, +}; + + // Chroma linear interpolation filter weights for width 4, horizontal modes ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 - 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 3 - 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 4 - 9, 23, 18, 14, 27, 5, 4, 28, 9, 23, 18, 14, 27, 5, 4, 28, // Mode 5 -12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 6 -14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 7 -16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 -18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 9 -20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 10 -22, 10, 12, 20, 2, 30, 24, 8, 22, 10, 12, 20, 2, 30, 24, 8, // Mode 11 -24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 -26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 13 -28, 4, 24, 8, 20, 12, 16, 16, 28, 4, 24, 8, 20, 12, 16, 16, // Mode 14 -29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 15 -30, 2, 28, 4, 26, 6, 24, 8, 30, 2, 28, 4, 26, 6, 24, 8, // Mode 16 -31, 1, 30, 2, 29, 3, 28, 4, 31, 1, 30, 2, 29, 3, 28, 4, // Mode 17 -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 - 1, 31, 2, 30, 3, 29, 4, 28, 1, 31, 2, 30, 3, 29, 4, 28, // Mode 19 - 2, 30, 4, 28, 6, 26, 8, 24, 2, 30, 4, 28, 6, 26, 8, 24, // Mode 20 - 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 21 - 4, 28, 8, 24, 12, 20, 16, 16, 4, 28, 8, 24, 12, 20, 16, 16, // Mode 22 - 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 
12, 20, 18, 14, 24, 8, // Mode 23 - 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 -10, 22, 20, 12, 30, 2, 8, 24, 10, 22, 20, 12, 30, 2, 8, 24, // Mode 25 -12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 26 -14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 27 -16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 -18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 29 -20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 30 -23, 9, 14, 18, 5, 27, 28, 4, 23, 9, 14, 18, 5, 27, 28, 4, // Mode 31 -26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 32 -29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 33 -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 9, 23, 18, 14, 27, 5, 4, 28, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 22, 10, 12, 20, 2, 30, 24, 8, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 28, 4, 24, 8, 20, 12, 16, 16, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 30, 2, 28, 4, 26, 6, 24, 8, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 31, 1, 30, 2, 29, 3, 28, 4, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 1, 31, 2, 30, 3, 29, 4, 28, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 2, 30, 4, 28, 6, 26, 8, 24, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 4, 28, 8, 24, 12, 20, 16, 16, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 10, 22, 20, 12, 30, 2, 8, 24, // Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 12, 20, 24, 8, 4, 28, 16, 16, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 14, 18, 28, 4, 10, 22, 24, 8, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 18, 14, 4, 28, 22, 10, 8, 24, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 20, 12, 8, 24, 28, 4, 16, 16, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 23, 9, 14, 18, 5, 27, 28, 4, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 26, 6, 20, 12, 14, 18, 8, 24, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 29, 3, 26, 6, 23, 9, 20, 12, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + + +// Chroma linear interpolation filter weights for width 8, horizontal modes +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, // Mode 4 + 
9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, // Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, // Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 }; // Chroma linear interpolation filter weights for width 4, vertical modes. 
ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 - 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 Offset 16 - 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, -15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, -21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, -27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, - 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, - 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, -13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, -19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, -25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, -31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, - 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, -11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, -17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, -23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, -29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, - 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 4 Offset 272 -18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, -30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, -10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, -22, 10, 22, 10, 22, 10, 22, 10, 28, 4, 28, 4, 28, 4, 28, 4, - 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, -14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, -26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, - 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 5 Offset 400 -27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, -13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, -31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, -17, 15, 17, 15, 17, 15, 17, 15, 26, 6, 26, 6, 26, 6, 26, 6, - 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, -21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, - 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, -25, 7, 25, 7, 25, 7, 25, 7, 2, 30, 2, 30, 2, 30, 2, 30, -11, 21, 11, 21, 11, 21, 11, 21, 20, 12, 20, 12, 20, 12, 20, 12, -29, 3, 29, 3, 29, 3, 29, 3, 6, 26, 6, 26, 6, 26, 6, 26, -15, 17, 15, 17, 15, 17, 15, 17, 24, 8, 24, 8, 24, 8, 24, 8, - 1, 31, 1, 31, 1, 31, 1, 31, 10, 22, 10, 22, 10, 22, 10, 22, -19, 13, 19, 13, 19, 13, 19, 13, 28, 4, 28, 4, 28, 4, 28, 4, - 5, 27, 5, 27, 5, 27, 5, 27, 14, 18, 14, 18, 14, 18, 14, 18, -23, 9, 23, 9, 23, 9, 23, 9, 32, 0, 32, 0, 32, 0, 32, 0, -12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 6 Offset 656 - 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, -28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, -20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, -14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 7 Offset 720 -10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, - 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, - 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, -30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, -26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, -22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, -18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, -16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 8 Offset 848 -18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 
28, // Mode 9 Offset 864 -22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, -26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, -30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, - 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, - 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, -10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, -14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, -20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 10 Offset 992 -28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, - 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, -12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, -22, 10, 22, 10, 22, 10, 22, 10, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 11 Offset 1056 - 2, 30, 2, 30, 2, 30, 2, 30, 24, 8, 24, 8, 24, 8, 24, 8, -14, 18, 14, 18, 14, 18, 14, 18, 4, 28, 4, 28, 4, 28, 4, 28, -26, 6, 26, 6, 26, 6, 26, 6, 16, 16, 16, 16, 16, 16, 16, 16, - 6, 26, 6, 26, 6, 26, 6, 26, 28, 4, 28, 4, 28, 4, 28, 4, -18, 14, 18, 14, 18, 14, 18, 14, 8, 24, 8, 24, 8, 24, 8, 24, -30, 2, 30, 2, 30, 2, 30, 2, 20, 12, 20, 12, 20, 12, 20, 12, -10, 22, 10, 22, 10, 22, 10, 22, 32, 0, 32, 0, 32, 0, 32, 0, -24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 12 Offset 1184 - 8, 24, 8, 24, 8, 24, 8, 24, 32, 0, 32, 0, 32, 0, 32, 0, -26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 13 Offset 1216 -14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, - 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, -22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, -10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, -30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, -18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, - 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, -28, 4, 28, 4, 28, 4, 28, 4, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 14 Offset 1344 -20, 12, 20, 12, 20, 12, 20, 12, 16, 16, 16, 16, 16, 16, 16, 16, -12, 20, 12, 20, 12, 20, 12, 20, 8, 24, 8, 24, 8, 24, 8, 24, - 4, 28, 4, 28, 4, 28, 4, 28, 32, 0, 32, 0, 32, 0, 32, 0, -29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 15 Offset 1408 -23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, -17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, -11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, - 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, -31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, -25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, -19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, -13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, - 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, - 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, -27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, -21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, -15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, - 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, - 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, -30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 16 Offset 1664 -26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, -22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, -18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, -14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, -10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, - 6, 26, 6, 26, 6, 26, 6, 26, 4, 
28, 4, 28, 4, 28, 4, 28, - 2, 30, 2, 30, 2, 30, 2, 30, 32, 0, 32, 0, 32, 0, 32, 0, -31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, // Mode 17 Offset 1792 -29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, -27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, -25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, -23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, -21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, -19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, -17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, -15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, -13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, -11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, - 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, - 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, - 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, - 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, - 1, 31, 1, 31, 1, 31, 1, 31, 32, 0, 32, 0, 32, 0, 32, 0, -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 Offset 2048 - 1, 31, 1, 31, 1, 31, 1, 31, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 19 Offset 2064 - 3, 29, 3, 29, 3, 29, 3, 29, 4, 28, 4, 28, 4, 28, 4, 28, - 5, 27, 5, 27, 5, 27, 5, 27, 6, 26, 6, 26, 6, 26, 6, 26, - 7, 25, 7, 25, 7, 25, 7, 25, 8, 24, 8, 24, 8, 24, 8, 24, - 9, 23, 9, 23, 9, 23, 9, 23, 10, 22, 10, 22, 10, 22, 10, 22, -11, 21, 11, 21, 11, 21, 11, 21, 12, 20, 12, 20, 12, 20, 12, 20, -13, 19, 13, 19, 13, 19, 13, 19, 14, 18, 14, 18, 14, 18, 14, 18, -15, 17, 15, 17, 15, 17, 15, 17, 16, 16, 16, 16, 16, 16, 16, 16, -17, 15, 17, 15, 17, 15, 17, 15, 18, 14, 18, 14, 18, 14, 18, 14, -19, 13, 19, 13, 19, 13, 19, 13, 20, 12, 20, 12, 20, 12, 20, 12, -21, 11, 21, 11, 21, 11, 21, 11, 22, 10, 22, 10, 22, 10, 22, 10, -23, 9, 23, 9, 23, 9, 23, 9, 24, 8, 24, 8, 24, 8, 24, 8, -25, 7, 25, 7, 25, 7, 25, 7, 26, 6, 26, 6, 26, 6, 26, 6, -27, 5, 27, 5, 27, 5, 27, 5, 28, 4, 28, 4, 28, 4, 28, 4, -29, 3, 29, 3, 29, 3, 29, 3, 30, 2, 30, 2, 30, 2, 30, 2, -31, 1, 31, 1, 31, 1, 31, 1, 32, 0, 32, 0, 32, 0, 32, 0, - 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 20 Offset 2320 - 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, -10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, -14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, -18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, -22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, -26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, -30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, - 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 21 Offset 2448 - 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, -15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, -21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, -27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, - 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, - 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, -13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, -19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, -25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, -31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, - 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, -11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, -17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, -23, 9, 23, 9, 23, 
9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, -29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, - 4, 28, 4, 28, 4, 28, 4, 28, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 22 Offset 2704 -12, 20, 12, 20, 12, 20, 12, 20, 16, 16, 16, 16, 16, 16, 16, 16, -20, 12, 20, 12, 20, 12, 20, 12, 24, 8, 24, 8, 24, 8, 24, 8, -28, 4, 28, 4, 28, 4, 28, 4, 32, 0, 32, 0, 32, 0, 32, 0, - 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 23 Offset 2768 -18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, -30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, -10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, -22, 10, 22, 10, 22, 10, 22, 10, 28, 4, 28, 4, 28, 4, 28, 4, - 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, -14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, -26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, - 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 24 Offset 2896 -24, 8, 24, 8, 24, 8, 24, 8, 32, 0, 32, 0, 32, 0, 32, 0, -10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 25 Offset 2928 -30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, -18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, - 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, -26, 6, 26, 6, 26, 6, 26, 6, 4, 28, 4, 28, 4, 28, 4, 28, -14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, - 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, -22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, -12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 26 Offset 3056 - 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, -28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, -20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, -14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 27 Offset 3120 -10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, - 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, - 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, -30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, -26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, -22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, -18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, -16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 28 Offset 3248 -18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 29 Offset 3264 -22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, -26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, -30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, - 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, - 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, -10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, -14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, -20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 30 Offset 3392 -28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, - 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, -12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, -23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 31 Offset 3456 - 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, -19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, - 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, -15, 17, 15, 17, 15, 17, 15, 17, 6, 26, 6, 26, 6, 26, 6, 26, -29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, -11, 21, 11, 21, 
11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, -25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, - 7, 25, 7, 25, 7, 25, 7, 25, 30, 2, 30, 2, 30, 2, 30, 2, -21, 11, 21, 11, 21, 11, 21, 11, 12, 20, 12, 20, 12, 20, 12, 20, - 3, 29, 3, 29, 3, 29, 3, 29, 26, 6, 26, 6, 26, 6, 26, 6, -17, 15, 17, 15, 17, 15, 17, 15, 8, 24, 8, 24, 8, 24, 8, 24, -31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, -13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, -27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, - 9, 23, 9, 23, 9, 23, 9, 23, 32, 0, 32, 0, 32, 0, 32, 0, -26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 32 Offset 3712 -14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, - 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, -22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, -10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, -30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, -18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, - 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, -29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 33 Offset 3840 -23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, -17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, -11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, - 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, -31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, -25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, -19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, -13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, - 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, - 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, -27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, -21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, -15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, - 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, - 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 Offset 4096 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 Offset 16 + 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, + 15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, + 21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, + 27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, + 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, + 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, + 13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, + 19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, + 25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, + 31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, + 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, + 11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, + 17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, + 23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, + 29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 4 Offset 272 + 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, + 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, + 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, + 22, 10, 22, 10, 22, 10, 22, 10, 28, 
4, 28, 4, 28, 4, 28, 4, + 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, + 14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, + 26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, + 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 5 Offset 400 + 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, + 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, + 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, + 17, 15, 17, 15, 17, 15, 17, 15, 26, 6, 26, 6, 26, 6, 26, 6, + 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, + 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, + 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, + 25, 7, 25, 7, 25, 7, 25, 7, 2, 30, 2, 30, 2, 30, 2, 30, + 11, 21, 11, 21, 11, 21, 11, 21, 20, 12, 20, 12, 20, 12, 20, 12, + 29, 3, 29, 3, 29, 3, 29, 3, 6, 26, 6, 26, 6, 26, 6, 26, + 15, 17, 15, 17, 15, 17, 15, 17, 24, 8, 24, 8, 24, 8, 24, 8, + 1, 31, 1, 31, 1, 31, 1, 31, 10, 22, 10, 22, 10, 22, 10, 22, + 19, 13, 19, 13, 19, 13, 19, 13, 28, 4, 28, 4, 28, 4, 28, 4, + 5, 27, 5, 27, 5, 27, 5, 27, 14, 18, 14, 18, 14, 18, 14, 18, + 23, 9, 23, 9, 23, 9, 23, 9, 32, 0, 32, 0, 32, 0, 32, 0, + 12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 6 Offset 656 + 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, + 28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, + 20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, + 14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 7 Offset 720 + 10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, + 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, + 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, + 30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, + 26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, + 22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, + 18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, + 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 8 Offset 848 + 18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 9 Offset 864 + 22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, + 26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, + 30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, + 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, + 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, + 10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, + 14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, + 20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 10 Offset 992 + 28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, + 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, + 12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, + 22, 10, 22, 10, 22, 10, 22, 10, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 11 Offset 1056 + 2, 30, 2, 30, 2, 30, 2, 30, 24, 8, 24, 8, 24, 8, 24, 8, + 14, 18, 14, 18, 14, 18, 14, 18, 4, 28, 4, 28, 4, 28, 4, 28, + 26, 6, 26, 6, 26, 6, 26, 6, 16, 16, 16, 16, 16, 16, 16, 16, + 6, 26, 6, 26, 6, 26, 6, 26, 28, 4, 28, 4, 28, 4, 28, 4, + 18, 14, 18, 14, 18, 14, 18, 14, 8, 24, 8, 24, 8, 24, 8, 24, + 30, 2, 30, 2, 30, 2, 30, 2, 20, 12, 20, 12, 20, 12, 20, 12, + 10, 22, 10, 22, 10, 22, 10, 22, 32, 0, 32, 0, 32, 0, 32, 0, + 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 12 Offset 1184 + 8, 24, 8, 24, 8, 24, 8, 24, 32, 0, 32, 0, 32, 0, 32, 0, + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 
12, 20, 12, // Mode 13 Offset 1216 + 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, + 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, + 10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, + 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, + 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, + 28, 4, 28, 4, 28, 4, 28, 4, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 14 Offset 1344 + 20, 12, 20, 12, 20, 12, 20, 12, 16, 16, 16, 16, 16, 16, 16, 16, + 12, 20, 12, 20, 12, 20, 12, 20, 8, 24, 8, 24, 8, 24, 8, 24, + 4, 28, 4, 28, 4, 28, 4, 28, 32, 0, 32, 0, 32, 0, 32, 0, + 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 15 Offset 1408 + 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, + 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, + 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, + 31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, + 25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, + 19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, + 13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, + 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, + 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, + 27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, + 21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, + 15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, + 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, + 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, + 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 16 Offset 1664 + 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, + 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, + 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, + 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, + 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, + 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, + 2, 30, 2, 30, 2, 30, 2, 30, 32, 0, 32, 0, 32, 0, 32, 0, + 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, // Mode 17 Offset 1792 + 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, + 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, + 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, + 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, + 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, + 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, + 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, + 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, + 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, + 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, + 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, + 1, 31, 1, 31, 1, 31, 1, 31, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 Offset 2048 + 1, 31, 1, 31, 1, 31, 1, 31, 2, 30, 2, 30, 2, 30, 2, 30, // Mode 19 Offset 2064 + 3, 29, 3, 29, 3, 29, 3, 29, 4, 28, 4, 28, 4, 28, 4, 28, + 5, 27, 5, 27, 5, 27, 5, 27, 6, 26, 6, 26, 6, 26, 6, 26, + 7, 25, 7, 25, 7, 25, 
7, 25, 8, 24, 8, 24, 8, 24, 8, 24, + 9, 23, 9, 23, 9, 23, 9, 23, 10, 22, 10, 22, 10, 22, 10, 22, + 11, 21, 11, 21, 11, 21, 11, 21, 12, 20, 12, 20, 12, 20, 12, 20, + 13, 19, 13, 19, 13, 19, 13, 19, 14, 18, 14, 18, 14, 18, 14, 18, + 15, 17, 15, 17, 15, 17, 15, 17, 16, 16, 16, 16, 16, 16, 16, 16, + 17, 15, 17, 15, 17, 15, 17, 15, 18, 14, 18, 14, 18, 14, 18, 14, + 19, 13, 19, 13, 19, 13, 19, 13, 20, 12, 20, 12, 20, 12, 20, 12, + 21, 11, 21, 11, 21, 11, 21, 11, 22, 10, 22, 10, 22, 10, 22, 10, + 23, 9, 23, 9, 23, 9, 23, 9, 24, 8, 24, 8, 24, 8, 24, 8, + 25, 7, 25, 7, 25, 7, 25, 7, 26, 6, 26, 6, 26, 6, 26, 6, + 27, 5, 27, 5, 27, 5, 27, 5, 28, 4, 28, 4, 28, 4, 28, 4, + 29, 3, 29, 3, 29, 3, 29, 3, 30, 2, 30, 2, 30, 2, 30, 2, + 31, 1, 31, 1, 31, 1, 31, 1, 32, 0, 32, 0, 32, 0, 32, 0, + 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 20 Offset 2320 + 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, + 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, + 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, + 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, + 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, + 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, + 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, + 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 21 Offset 2448 + 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, + 15, 17, 15, 17, 15, 17, 15, 17, 18, 14, 18, 14, 18, 14, 18, 14, + 21, 11, 21, 11, 21, 11, 21, 11, 24, 8, 24, 8, 24, 8, 24, 8, + 27, 5, 27, 5, 27, 5, 27, 5, 30, 2, 30, 2, 30, 2, 30, 2, + 1, 31, 1, 31, 1, 31, 1, 31, 4, 28, 4, 28, 4, 28, 4, 28, + 7, 25, 7, 25, 7, 25, 7, 25, 10, 22, 10, 22, 10, 22, 10, 22, + 13, 19, 13, 19, 13, 19, 13, 19, 16, 16, 16, 16, 16, 16, 16, 16, + 19, 13, 19, 13, 19, 13, 19, 13, 22, 10, 22, 10, 22, 10, 22, 10, + 25, 7, 25, 7, 25, 7, 25, 7, 28, 4, 28, 4, 28, 4, 28, 4, + 31, 1, 31, 1, 31, 1, 31, 1, 2, 30, 2, 30, 2, 30, 2, 30, + 5, 27, 5, 27, 5, 27, 5, 27, 8, 24, 8, 24, 8, 24, 8, 24, + 11, 21, 11, 21, 11, 21, 11, 21, 14, 18, 14, 18, 14, 18, 14, 18, + 17, 15, 17, 15, 17, 15, 17, 15, 20, 12, 20, 12, 20, 12, 20, 12, + 23, 9, 23, 9, 23, 9, 23, 9, 26, 6, 26, 6, 26, 6, 26, 6, + 29, 3, 29, 3, 29, 3, 29, 3, 32, 0, 32, 0, 32, 0, 32, 0, + 4, 28, 4, 28, 4, 28, 4, 28, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 22 Offset 2704 + 12, 20, 12, 20, 12, 20, 12, 20, 16, 16, 16, 16, 16, 16, 16, 16, + 20, 12, 20, 12, 20, 12, 20, 12, 24, 8, 24, 8, 24, 8, 24, 8, + 28, 4, 28, 4, 28, 4, 28, 4, 32, 0, 32, 0, 32, 0, 32, 0, + 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, // Mode 23 Offset 2768 + 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, + 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, + 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, + 22, 10, 22, 10, 22, 10, 22, 10, 28, 4, 28, 4, 28, 4, 28, 4, + 2, 30, 2, 30, 2, 30, 2, 30, 8, 24, 8, 24, 8, 24, 8, 24, + 14, 18, 14, 18, 14, 18, 14, 18, 20, 12, 20, 12, 20, 12, 20, 12, + 26, 6, 26, 6, 26, 6, 26, 6, 32, 0, 32, 0, 32, 0, 32, 0, + 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, // Mode 24 Offset 2896 + 24, 8, 24, 8, 24, 8, 24, 8, 32, 0, 32, 0, 32, 0, 32, 0, + 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 25 Offset 2928 + 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, + 18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, + 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, + 26, 6, 26, 6, 26, 6, 26, 6, 4, 28, 4, 28, 4, 28, 4, 28, 
+ 14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, + 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, + 22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, + 12, 20, 12, 20, 12, 20, 12, 20, 24, 8, 24, 8, 24, 8, 24, 8, // Mode 26 Offset 3056 + 4, 28, 4, 28, 4, 28, 4, 28, 16, 16, 16, 16, 16, 16, 16, 16, + 28, 4, 28, 4, 28, 4, 28, 4, 8, 24, 8, 24, 8, 24, 8, 24, + 20, 12, 20, 12, 20, 12, 20, 12, 32, 0, 32, 0, 32, 0, 32, 0, + 14, 18, 14, 18, 14, 18, 14, 18, 28, 4, 28, 4, 28, 4, 28, 4, // Mode 27 Offset 3120 + 10, 22, 10, 22, 10, 22, 10, 22, 24, 8, 24, 8, 24, 8, 24, 8, + 6, 26, 6, 26, 6, 26, 6, 26, 20, 12, 20, 12, 20, 12, 20, 12, + 2, 30, 2, 30, 2, 30, 2, 30, 16, 16, 16, 16, 16, 16, 16, 16, + 30, 2, 30, 2, 30, 2, 30, 2, 12, 20, 12, 20, 12, 20, 12, 20, + 26, 6, 26, 6, 26, 6, 26, 6, 8, 24, 8, 24, 8, 24, 8, 24, + 22, 10, 22, 10, 22, 10, 22, 10, 4, 28, 4, 28, 4, 28, 4, 28, + 18, 14, 18, 14, 18, 14, 18, 14, 32, 0, 32, 0, 32, 0, 32, 0, + 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 28 Offset 3248 + 18, 14, 18, 14, 18, 14, 18, 14, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 29 Offset 3264 + 22, 10, 22, 10, 22, 10, 22, 10, 8, 24, 8, 24, 8, 24, 8, 24, + 26, 6, 26, 6, 26, 6, 26, 6, 12, 20, 12, 20, 12, 20, 12, 20, + 30, 2, 30, 2, 30, 2, 30, 2, 16, 16, 16, 16, 16, 16, 16, 16, + 2, 30, 2, 30, 2, 30, 2, 30, 20, 12, 20, 12, 20, 12, 20, 12, + 6, 26, 6, 26, 6, 26, 6, 26, 24, 8, 24, 8, 24, 8, 24, 8, + 10, 22, 10, 22, 10, 22, 10, 22, 28, 4, 28, 4, 28, 4, 28, 4, + 14, 18, 14, 18, 14, 18, 14, 18, 32, 0, 32, 0, 32, 0, 32, 0, + 20, 12, 20, 12, 20, 12, 20, 12, 8, 24, 8, 24, 8, 24, 8, 24, // Mode 30 Offset 3392 + 28, 4, 28, 4, 28, 4, 28, 4, 16, 16, 16, 16, 16, 16, 16, 16, + 4, 28, 4, 28, 4, 28, 4, 28, 24, 8, 24, 8, 24, 8, 24, 8, + 12, 20, 12, 20, 12, 20, 12, 20, 32, 0, 32, 0, 32, 0, 32, 0, + 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode 31 Offset 3456 + 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, + 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, + 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, + 15, 17, 15, 17, 15, 17, 15, 17, 6, 26, 6, 26, 6, 26, 6, 26, + 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, + 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, + 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, + 7, 25, 7, 25, 7, 25, 7, 25, 30, 2, 30, 2, 30, 2, 30, 2, + 21, 11, 21, 11, 21, 11, 21, 11, 12, 20, 12, 20, 12, 20, 12, 20, + 3, 29, 3, 29, 3, 29, 3, 29, 26, 6, 26, 6, 26, 6, 26, 6, + 17, 15, 17, 15, 17, 15, 17, 15, 8, 24, 8, 24, 8, 24, 8, 24, + 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, + 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, + 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, + 9, 23, 9, 23, 9, 23, 9, 23, 32, 0, 32, 0, 32, 0, 32, 0, + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 32 Offset 3712 + 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, + 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, + 10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, + 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, + 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, + 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 33 Offset 3840 + 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, + 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 
14, 18, + 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, + 31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, + 25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, + 19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, + 13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, + 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, + 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, + 27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, + 21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, + 15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, + 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, + 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 Offset 4096 }; #endif INTRA_AVX2_TABLES_H From 6a0a043520e1c0355c5745624b68c441115c0c0a Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Jan 2024 17:55:38 +0200 Subject: [PATCH 067/237] Implement faster version of intra avx2 chroma linear interpolation for horizontal w16. Add weight and shuffle tables for w16. --- src/strategies/avx2/intra-avx2.c | 65 +++++++------- src/strategies/avx2/intra_avx2_tables.h | 108 ++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 31 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 13d4b823..767f8e23 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1566,42 +1566,45 @@ static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; - const __m128i v16s = _mm_set1_epi16(16); + const __m256i v16s = _mm256_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 64; + const int16_t shuf_offset = (mode - 2) * 64; - int8_t tmp_coeff[32]; - for (int x = 0, offset = 0; x < 16; ++x, offset += 2) { - tmp_coeff[offset + 0] = 32 - dfract[x]; - tmp_coeff[offset + 1] = dfract[x]; - } - __m128i* vcoeff0 = (__m128i*) &tmp_coeff[0]; - __m128i* vcoeff1 = (__m128i*) &tmp_coeff[16]; + __m256i vcoeff0 = _mm256_load_si256((const __m256i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 0]); + __m256i vcoeff1 = _mm256_load_si256((const __m256i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 32]); + __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 0]); + __m128i vshuf2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 16]); // Swap the middle two shuffle vectors. Packus will swap results back into place. 
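+  // Each byte pair in the weight table is (32 - f, f) for one output column, where f is that
+  // column's fractional offset for this prediction mode, so a single _mm256_maddubs_epi16
+  // evaluates the 2-tap chroma linear filter for 16 samples. Equivalent scalar form for one
+  // sample (with o = delta_int[x] + y + 1):
+  //   dst[y * 16 + x] = ((32 - f) * ref[o] + f * ref[o + 1] + 16) >> 5;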
+ __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 32]); + __m128i vshuf3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 48]); - // Height has to be at least 1, handle 1 line at a time - for (int y = 0; y < height; ++y) { - // TODO: find a more efficient way to do this - uvg_pixel src[32]; - for (int x = 0, offset = 0; x < 16; ++x, offset += 2) { - const int ref_offset = dint[x] + y + 1; - src[offset + 0] = ref[ref_offset + 0]; - src[offset + 1] = ref[ref_offset + 1]; - } - - __m128i* vsrc0 = (__m128i*) & src[0]; - __m128i* vsrc1 = (__m128i*) & src[16]; + // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices + const int16_t min_offset = 1 + MIN(dint[0], dint[15]); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, *vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, *vcoeff1); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); + // Height has to be at least 2, there is no 16x1 block for chroma. Handle 2 lines at once with 256-bit vectors. + for (int y = 0; y < height; y += 2) { + // Prepare sources + __m128i vsrc_tmp = _mm_loadu_si128((__m128i*) & ref[min_offset + y]); + __m128i vsrc[4]; + vsrc[0] = _mm_shuffle_epi8(vsrc_tmp, vshuf0); + vsrc[1] = _mm_shuffle_epi8(vsrc_tmp, vshuf1); + vsrc[2] = _mm_shuffle_epi8(vsrc_tmp, vshuf2); + vsrc[3] = _mm_shuffle_epi8(vsrc_tmp, vshuf3); - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; + const __m256i* vsrc256_0 = (const __m256i*)&vsrc[0]; + const __m256i* vsrc256_1 = (const __m256i*)&vsrc[2]; + + __m256i res0 = _mm256_maddubs_epi16(*vsrc256_0, vcoeff0); + __m256i res1 = _mm256_maddubs_epi16(*vsrc256_1, vcoeff1); + res0 = _mm256_add_epi16(res0, v16s); + res1 = _mm256_add_epi16(res1, v16s); + res0 = _mm256_srai_epi16(res0, 5); + res1 = _mm256_srai_epi16(res1, 5); + + _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); + dst += 32; } } @@ -2393,7 +2396,7 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, pred_mode, delta_int); break; case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 233057b7..1282cd64 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -246,6 +246,76 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_ho }; +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[] = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, // Mode 2 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 
0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, 0x10, 0x11, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, // Mode 3 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, // Mode 4 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, // Mode 5 + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0b, 0x0c, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, // Mode 6 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, // Mode 7 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, // Mode 8 + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, // Mode 9 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, // Mode 10 + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 11 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 
0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 13 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 14 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 15 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 16 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 
0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 25 + 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 26 + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 27 + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 + 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 29 + 0x09, 0x0a, 0x08, 
0x09, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 30 + 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 31 + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 32 + 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 Note: this mode does not work. + 0x10, 0x11, 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02 +}; + + // Chroma linear interpolation filter weights for width 4, horizontal modes ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 @@ -322,6 +392,44 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { }; +// Chroma linear interpolation filter weights for width 8, horizontal modes. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 
// Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 
20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + + // Chroma linear interpolation filter weights for width 4, vertical modes. ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 From 9fcd73275a324b00c3543f088e074d0c9f902a6e Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 31 Jan 2024 18:38:43 +0200 Subject: [PATCH 068/237] Implement faster version of intra avx2 chroma linear interpolation for horizontal w32. Add weight and shuffle tables for w32. --- src/strategies/avx2/intra-avx2.c | 56 ++++++------ src/strategies/avx2/intra_avx2_tables.h | 112 +++++++++++++++++++++++- 2 files changed, 139 insertions(+), 29 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 767f8e23..314ee297 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1609,46 +1609,48 @@ static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int16_t* dint = delta_int; const int16_t* dfract = delta_fract; const __m256i v16s = _mm256_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 64; + const int16_t shuf_offset = (mode - 2) * 64; - int8_t tmp_coeff[64]; - for (int x = 0, offset = 0; x < 32; ++x, offset += 2) { - tmp_coeff[offset + 0] = 32 - dfract[x]; - tmp_coeff[offset + 1] = dfract[x]; - } - __m256i* vcoeff0 = (__m256i*) &tmp_coeff[0]; - __m256i* vcoeff1 = (__m256i*) &tmp_coeff[32]; + __m256i vcoeff0 = _mm256_load_si256((const __m256i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 0]); + __m256i vcoeff1 = _mm256_load_si256((const __m256i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 32]); + __m128i vshuf0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 0]); + __m128i vshuf1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 16]); + __m128i vshuf2 = _mm_load_si128((const __m128i*) 
&intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 32]); + __m128i vshuf3 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 48]); - // Height has to be at least 1, handle 1 line at a time - for (int y = 0; y < height; ++y) { - // TODO: find a more efficient way to do this - uvg_pixel src[64]; - for (int x = 0, offset = 0; x < 32; ++x, offset += 2) { - const int ref_offset = dint[x] + y + 1; - src[offset + 0] = ref[ref_offset + 0]; - src[offset + 1] = ref[ref_offset + 1]; - } + // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices + // Due to width, two loads are needed, and therefore two offsets. Cannot use 256-bit loads due to alignment issues. + const int16_t min_offset0 = 1 + MIN(dint[0], dint[15]); + const int16_t min_offset1 = 1 + MIN(dint[16], dint[31]); + // Height has to be at least 2. Due to width, handle 1 line at a time + for (int y = 0; y < height; ++y) { + // Prepare sources + __m128i vsrc_tmp0 = _mm_loadu_si128((__m128i*) &ref[min_offset0 + y]); + __m128i vsrc_tmp1 = _mm_loadu_si128((__m128i*) &ref[min_offset1 + y]); + __m128i vsrc[4]; + vsrc[0] = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); + vsrc[1] = _mm_shuffle_epi8(vsrc_tmp1, vshuf2); // Swap the middle two shuffle vectors and sources. Packus will swap results back into place. Corresponding weights are also swapped in the table. + vsrc[2] = _mm_shuffle_epi8(vsrc_tmp0, vshuf1); + vsrc[3] = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); - __m256i* vsrc0 = (__m256i*) &src[0]; - __m256i* vsrc1 = (__m256i*) &src[32]; + const __m256i* vsrc256_0 = (const __m256i*) &vsrc[0]; + const __m256i* vsrc256_1 = (const __m256i*) &vsrc[2]; - __m256i res0 = _mm256_maddubs_epi16(*vsrc0, *vcoeff0); - __m256i res1 = _mm256_maddubs_epi16(*vsrc1, *vcoeff1); + __m256i res0 = _mm256_maddubs_epi16(*vsrc256_0, vcoeff0); + __m256i res1 = _mm256_maddubs_epi16(*vsrc256_1, vcoeff1); res0 = _mm256_add_epi16(res0, v16s); res1 = _mm256_add_epi16(res1, v16s); res0 = _mm256_srai_epi16(res0, 5); res1 = _mm256_srai_epi16(res1, 5); - //res0 = _mm256_permute4x64_epi64(res0, _MM_SHUFFLE(3, 1, 2, 0)); - //res1 = _mm256_permute4x64_epi64(res1, _MM_SHUFFLE(3, 1, 2, 0)); - __m256i res_final = _mm256_packus_epi16(res0, res1); - res_final = _mm256_permute4x64_epi64(res_final, _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_store_si256((__m256i*)dst, res_final); + _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); dst += 32; } } @@ -2397,7 +2399,7 @@ static void uvg_angular_pred_avx2( case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 1282cd64..8c5196c8 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -311,11 +311,81 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_h 0x0d, 0x0e, 0x0c, 0x0d, 
0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, - 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 Note: this mode does not work. + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 0x10, 0x11, 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02 }; +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[] = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, // Mode 2 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, // Mode 3 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, // Mode 4 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, // Mode 5 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, // Mode 6 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, // Mode 7 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 
0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, // Mode 8 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, // Mode 9 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, // Mode 10 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 11 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 13 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 14 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 15 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 16 + 0x00, 0x01, 0x00, 0x01, 0x00, 
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // 
Mode 25 + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 26 + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 27 + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 29 + 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 30 + 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 31 + 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 32 + 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 + 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 
0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 + 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, +}; + + // Chroma linear interpolation filter weights for width 4, horizontal modes ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 @@ -392,7 +462,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { }; -// Chroma linear interpolation filter weights for width 8, horizontal modes. +// Chroma linear interpolation filter weights for width 16, horizontal modes. ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 3 @@ -430,6 +500,44 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { }; +// Chroma linear interpolation filter weights for width 32, horizontal modes. +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor[] = { +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 5 +12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 +14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 +16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 +18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 +20, 12, 8, 24, 
28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 +22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 +24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 +26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 +28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 +29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 15 +30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 +31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 15, 17, 14, 18, 13, 19, 12, 20, 11, 21, 10, 22, 9, 23, 8, 24, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 7, 25, 6, 26, 5, 27, 4, 28, 3, 29, 2, 30, 1, 31, 32, 0, // Mode 17 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 17, 15, 18, 14, 19, 13, 20, 12, 21, 11, 22, 10, 23, 9, 24, 8, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 25, 7, 26, 6, 27, 5, 28, 4, 29, 3, 30, 2, 31, 1, 32, 0, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 
16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 +10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 +12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 +14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 +16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 +18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 +20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 +23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 31 +26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 +29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 33 +32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + + // Chroma linear interpolation filter weights for width 4, vertical modes. ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 From c2f1bef479ff5a4cc08b7a730d560f6edb037b81 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 2 Feb 2024 15:54:10 +0200 Subject: [PATCH 069/237] Alternate version of chroma linear interpolation w16 horizontal. 
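
The w16 horizontal path is reworked along the same lines as the w32 path from the previous patch: the single 256-bit pass is split into 128-bit loads, shuffles and maddubs, the per-mode weight rows shrink from 64 to 32 bytes, and the 16 output pixels of each row are handled as two halves with separate minimum reference offsets while still producing two rows per loop iteration.

For reference, every variant of this filter computes the same two-tap linear interpolation between adjacent reference samples with 5 fractional bits and round-to-nearest. A minimal scalar sketch of that arithmetic follows (illustrative only, not part of this patch; the function name and loop form are placeholders):

    static void chroma_linear_interp_scalar(uint8_t *dst, const uint8_t *ref,
                                            int width, int height,
                                            const int16_t *delta_int,
                                            const int16_t *delta_fract)
    {
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          const int idx = delta_int[x] + y + 1;   // integer part selects the sample pair
          const int f   = delta_fract[x];         // fractional part in 1/32 units
          // Weights are (32 - f, f); the SIMD path evaluates these byte pairs
          // with maddubs, then adds 16 and shifts right by 5, as done here.
          dst[y * width + x] =
              (uint8_t)(((32 - f) * ref[idx] + f * ref[idx + 1] + 16) >> 5);
        }
      }
    }

The weight tables store the (32 - f, f) byte pairs per mode, and the shuffle tables gather each pair of reference samples relative to the smallest delta_int offset of the half-row, so the inner loop reduces to loads, shuffles, maddubs, add, shift and packus.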
--- src/strategies/avx2/intra-avx2.c | 53 ++--- src/strategies/avx2/intra_avx2_tables.h | 264 +++++++++++++++--------- 2 files changed, 194 insertions(+), 123 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 314ee297..e1f8593c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1569,41 +1569,46 @@ static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* re static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; - const __m256i v16s = _mm256_set1_epi16(16); - const int16_t weigth_offset = (mode - 2) * 64; + const __m128i v16s = _mm_set1_epi16(16); + const int16_t weigth_offset = (mode - 2) * 32; const int16_t shuf_offset = (mode - 2) * 64; - __m256i vcoeff0 = _mm256_load_si256((const __m256i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 0]); - __m256i vcoeff1 = _mm256_load_si256((const __m256i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 32]); + __m128i vcoeff0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 0]); + __m128i vcoeff1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor[weigth_offset + 16]); __m128i vshuf0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 0]); - __m128i vshuf2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 16]); // Swap the middle two shuffle vectors. Packus will swap results back into place. - __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 32]); + __m128i vshuf1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 16]); + __m128i vshuf2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 32]); __m128i vshuf3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[shuf_offset + 48]); // Load refs from smallest index onwards, shuffle will handle the rest. The smallest index will be at one of these delta int table indices - const int16_t min_offset = 1 + MIN(dint[0], dint[15]); + const int16_t min_offset0 = 1 + MIN(dint[0], dint[7]); + const int16_t min_offset1 = 1 + MIN(dint[8], dint[15]); - // Height has to be at least 2, there is no 16x1 block for chroma. Handle 2 lines at once with 256-bit vectors. + // Height has to be at least 2, there is no 16x1 block for chroma. 
for (int y = 0; y < height; y += 2) { // Prepare sources - __m128i vsrc_tmp = _mm_loadu_si128((__m128i*) & ref[min_offset + y]); - __m128i vsrc[4]; - vsrc[0] = _mm_shuffle_epi8(vsrc_tmp, vshuf0); - vsrc[1] = _mm_shuffle_epi8(vsrc_tmp, vshuf1); - vsrc[2] = _mm_shuffle_epi8(vsrc_tmp, vshuf2); - vsrc[3] = _mm_shuffle_epi8(vsrc_tmp, vshuf3); - - const __m256i* vsrc256_0 = (const __m256i*)&vsrc[0]; - const __m256i* vsrc256_1 = (const __m256i*)&vsrc[2]; + __m128i vsrc_tmp0 = _mm_loadu_si128((__m128i*) &ref[min_offset0 + y]); + __m128i vsrc_tmp1 = _mm_loadu_si128((__m128i*) &ref[min_offset1 + y]); + const __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); + const __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp1, vshuf1); + const __m128i vsrc2 = _mm_shuffle_epi8(vsrc_tmp0, vshuf2); + const __m128i vsrc3 = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); - __m256i res0 = _mm256_maddubs_epi16(*vsrc256_0, vcoeff0); - __m256i res1 = _mm256_maddubs_epi16(*vsrc256_1, vcoeff1); - res0 = _mm256_add_epi16(res0, v16s); - res1 = _mm256_add_epi16(res1, v16s); - res0 = _mm256_srai_epi16(res0, 5); - res1 = _mm256_srai_epi16(res1, 5); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff0); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); - _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*)&dst[0], _mm_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*)&dst[16], _mm_packus_epi16(res2, res3)); dst += 32; } } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 8c5196c8..d8fb08dc 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -247,72 +247,138 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_ho ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[] = { - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, // Mode 2 - 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, 0x10, 0x11, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, // Mode 3 - 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, // Mode 4 - 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 
0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0a, 0x0b, 0x0b, 0x0c, // Mode 5 - 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0b, 0x0c, 0x0c, 0x0d, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, // Mode 6 - 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, // Mode 7 - 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, // Mode 8 - 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, 0x09, 0x0a, - 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, // Mode 9 - 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, - 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, // Mode 10 - 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 11 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 13 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 
0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 14 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 15 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 16 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 
0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 - 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 - 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 25 - 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 26 - 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 27 - 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, - 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 - 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, - 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 29 - 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, - 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 30 - 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, - 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 31 - 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, - 0x0c, 0x0d, 
0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 32 - 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, - 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 - 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, - 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 - 0x10, 0x11, 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 2 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 3 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 4 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // Mode 5 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // Mode 6 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 7 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 
0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 8 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, // Mode 9 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 10 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, // Mode 11 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 12 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 13 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, // Mode 14 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 15 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 16 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 
0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 17 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 18 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 19 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 20 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 21 + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 22 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 23 + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 24 + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 25 + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x03, 
0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, // Mode 26 + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 27 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 28 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, // Mode 29 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 30 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, // Mode 31 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 32 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 33 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, // Mode 34 + 0x07, 0x08, 0x06, 0x07, 0x05, 
0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, }; @@ -464,39 +530,39 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { // Chroma linear interpolation filter weights for width 16, horizontal modes. ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 - 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 3 - 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 - 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, // Mode 5 - 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 - 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 - 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 - 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 - 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 - 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 - 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 - 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 - 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 
4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 - 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, // Mode 15 - 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 - 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, // Mode 17 - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 - 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, // Mode 19 - 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 - 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 21 - 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 - 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 - 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 - 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 - 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 - 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 - 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 
16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 - 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 - 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 - 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, // Mode 31 - 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 - 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, // Mode 33 - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, // Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 
6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 }; From 1d79bbb62032ab37dcf68c948b9810f4ef8b313b Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 6 Feb 2024 16:19:58 +0200 Subject: [PATCH 070/237] Faster version of chroma linear interpolation w32 horizontal. 
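For readers skimming the diff below: every output sample in these chroma linear interpolation kernels is a plain two-tap filter between adjacent reference samples, and the interleaved weight tables store the (32 - frac, frac) pairs so that _mm_maddubs_epi16 can form both products at once before the +16 rounding and >> 5. A minimal scalar sketch of what one lane computes (the helper name is illustrative, not part of the patch):

    #include <stdint.h>

    /* Two-tap chroma linear interpolation for one output pixel.
     * a and b are adjacent reference samples, frac is the fractional
     * part of the projected reference position in 1/32 units (0..31). */
    static inline uint8_t chroma_linear_tap(uint8_t a, uint8_t b, int frac)
    {
      return (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
    }

The w32 horizontal rework in this patch keeps that arithmetic unchanged; it only regroups the shuffles, weights and stores from one 256-bit pass into 128-bit halves.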
--- src/strategies/avx2/intra-avx2.c | 41 ++++++++------- src/strategies/avx2/intra_avx2_tables.h | 66 ++++++++++++------------- 2 files changed, 56 insertions(+), 51 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e1f8593c..069fc326 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1618,12 +1618,14 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r { const int16_t* dint = delta_int; const int16_t* dfract = delta_fract; - const __m256i v16s = _mm256_set1_epi16(16); + const __m128i v16s = _mm_set1_epi16(16); const int16_t weigth_offset = (mode - 2) * 64; const int16_t shuf_offset = (mode - 2) * 64; - __m256i vcoeff0 = _mm256_load_si256((const __m256i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 0]); - __m256i vcoeff1 = _mm256_load_si256((const __m256i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 32]); + __m128i vcoeff0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 0]); + __m128i vcoeff1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 16]); + __m128i vcoeff2 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 32]); + __m128i vcoeff3 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w32_hor[weigth_offset + 48]); __m128i vshuf0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 0]); __m128i vshuf1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 16]); __m128i vshuf2 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[shuf_offset + 32]); @@ -1639,23 +1641,26 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r // Prepare sources __m128i vsrc_tmp0 = _mm_loadu_si128((__m128i*) &ref[min_offset0 + y]); __m128i vsrc_tmp1 = _mm_loadu_si128((__m128i*) &ref[min_offset1 + y]); - __m128i vsrc[4]; - vsrc[0] = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); - vsrc[1] = _mm_shuffle_epi8(vsrc_tmp1, vshuf2); // Swap the middle two shuffle vectors and sources. Packus will swap results back into place. Corresponding weights are also swapped in the table. 
- vsrc[2] = _mm_shuffle_epi8(vsrc_tmp0, vshuf1); - vsrc[3] = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_tmp0, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_tmp0, vshuf1); + __m128i vsrc2 = _mm_shuffle_epi8(vsrc_tmp1, vshuf2); + __m128i vsrc3 = _mm_shuffle_epi8(vsrc_tmp1, vshuf3); - const __m256i* vsrc256_0 = (const __m256i*) &vsrc[0]; - const __m256i* vsrc256_1 = (const __m256i*) &vsrc[2]; - - __m256i res0 = _mm256_maddubs_epi16(*vsrc256_0, vcoeff0); - __m256i res1 = _mm256_maddubs_epi16(*vsrc256_1, vcoeff1); - res0 = _mm256_add_epi16(res0, v16s); - res1 = _mm256_add_epi16(res1, v16s); - res0 = _mm256_srai_epi16(res0, 5); - res1 = _mm256_srai_epi16(res1, 5); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff2); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff3); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); - _mm256_store_si256((__m256i*)dst, _mm256_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*)&dst[0], _mm_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*)&dst[16], _mm_packus_epi16(res2, res3)); dst += 32; } } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index d8fb08dc..074a2f70 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -568,39 +568,39 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { // Chroma linear interpolation filter weights for width 32, horizontal modes. 
ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor[] = { -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 - 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 - 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 - 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 5 -12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 -14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 -16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 -18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 -20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 -22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 -24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 -26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 -28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 -29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 15 -30, 
2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 -31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 15, 17, 14, 18, 13, 19, 12, 20, 11, 21, 10, 22, 9, 23, 8, 24, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 7, 25, 6, 26, 5, 27, 4, 28, 3, 29, 2, 30, 1, 31, 32, 0, // Mode 17 -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 - 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 17, 15, 18, 14, 19, 13, 20, 12, 21, 11, 22, 10, 23, 9, 24, 8, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 25, 7, 26, 6, 27, 5, 28, 4, 29, 3, 30, 2, 31, 1, 32, 0, // Mode 19 - 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 - 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 21 - 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 - 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 - 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 -10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 -12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 -14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 -16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 -18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 -20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 
4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 -23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 31 -26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 -29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 33 -32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 
26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 15, 17, 14, 18, 13, 19, 12, 20, 11, 21, 10, 22, 9, 23, 8, 24, 7, 25, 6, 26, 5, 27, 4, 28, 3, 29, 2, 30, 1, 31, 32, 0, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 17, 15, 18, 14, 19, 13, 20, 12, 21, 11, 22, 10, 23, 9, 24, 8, 25, 7, 26, 6, 27, 5, 28, 4, 29, 3, 30, 2, 31, 1, 32, 0, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 
// Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 }; From 0370e8857f39420cb647ee8f31552c04b57b90be Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 6 Feb 2024 23:56:03 +0200 Subject: [PATCH 071/237] Implement alternate version of chroma linear interpolation w4 vertical. Replaced ref gather with 128-bit load. Shuffle vectors are loaded from a table. --- src/strategies/avx2/intra-avx2.c | 29 ++++++------- src/strategies/avx2/intra_avx2_tables.h | 58 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 069fc326..193b9267 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1339,34 +1339,33 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); - const __m256i vshuf = _mm256_setr_epi8( - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c - ); const int mode_idx = wide_angle_mode ? 0 : (pred_mode <= 34 ? 
(pred_mode - 2) : (66 - pred_mode)); - const int table_offset = coeff_table_mode_offsets[mode_idx]; + const int weight_table_offset = coeff_table_mode_offsets[mode_idx]; const int vnum = coeff_vector128_num_by_mode[mode_idx]; const int modulo = vnum - 1; int offset_num = 0; + int16_t shuffle_vector_offsets[8]; + memcpy(shuffle_vector_offsets, &intra_chroma_linear_interpolation_w4_ver_shuffle_vector_offset[mode_idx * 8], sizeof(int16_t) * 8); + // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { - const int offset = table_offset + (offset_num * 16); - const __m256i vidx = _mm256_setr_epi64x(dint[0]+1, dint[1]+1, dint[2]+1, dint[3]+1); + // Load refs from the smallest index onwards; the shuffle will handle the rest. The smallest index will be at one of these delta int table indices + const int16_t min_offset = 1 + MIN(dint[0], dint[3]); dint += 4; - + // Load enough ref samples to cover four 4-wide lines. Shuffles will put the samples in the correct places. + const __m128i vsrc_raw = _mm_loadu_si128((const __m128i*) & ref[min_offset]); + const int offset = weight_table_offset + (offset_num * 16); + const __m128i vcoeff0 = _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_weights_w4_ver[offset]); const __m128i vcoeff1 = vnum == 1 ? vcoeff0 : _mm_load_si128((const __m128i*)&intra_chroma_linear_interpolation_weights_w4_ver[offset + 16]); - __m256i vsrc; - vsrc = _mm256_i64gather_epi64((const long long int*)ref, vidx, 1); - vsrc = _mm256_shuffle_epi8(vsrc, vshuf); + const __m128i vshuf0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_ver[shuffle_vector_offsets[y >> 2] + 0]); + const __m128i vshuf1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_shuffle_vectors_w4_ver[shuffle_vector_offsets[y >> 2] + 16]); - __m128i vsrc0 = _mm256_extracti128_si256(vsrc, 0); - __m128i vsrc1 = _mm256_extracti128_si256(vsrc, 1); + __m128i vsrc0 = _mm_shuffle_epi8(vsrc_raw, vshuf0); + __m128i vsrc1 = _mm_shuffle_epi8(vsrc_raw, vshuf1); __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 074a2f70..19961b92 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -40,6 +40,64 @@ ALIGNED(32) const int16_t mode_to_shuffle_vector_table_offset_w4_hor[35] = { }; +// Index with (mode - 2) * 8 + (y >> 2). The given index will point to the correct place in the shuffle vector table.
+ALIGNED(32) const int16_t intra_chroma_linear_interpolation_w4_ver_shuffle_vector_offset[] = { + 0, 0, 0, 0, 0, 0, 0, 0, // Mode 2 + 0, 0, 32, 0, 0, 64, 0, 0, // Mode 3 + 0, 64, 32, 0, 0, 64, 32, 0, // Mode 4 + 96, 96, 32, 32, 64, 64, 0, 0, // Mode 5 + 32, 64, 32, 64, 32, 64, 32, 64, // Mode 6 + 32, 32, 128, 64, 32, 32, 128, 64, // Mode 7 + 32, 32, 32, 32, 32, 32, 32, 32, // Mode 8 + 128, 128, 32, 32, 128, 128, 32, 32, // Mode 9 + 128, 32, 128, 32, 128, 32, 128, 32, // Mode 10 + 160, 128, 192, 160, 160, 128, 192, 160, // Mode 11 + 160, 160, 160, 160, 160, 160, 160, 160, // Mode 12 + 224, 192, 128, 160, 224, 192, 128, 160, // Mode 13 + 224, 160, 224, 160, 224, 160, 224, 160, // Mode 14 + 224, 224, 128, 224, 224, 192, 224, 160, // Mode 15 + 224, 224, 224, 160, 224, 224, 224, 160, // Mode 16 + 224, 224, 224, 224, 224, 224, 224, 160, // Mode 17 + 224, 224, 224, 224, 224, 224, 224, 224, // Mode 18 + 224, 224, 224, 224, 224, 224, 224, 224, // Mode 19 + 224, 224, 224, 224, 224, 224, 224, 224, // Mode 20 + 224, 224, 256, 224, 224, 288, 224, 224, // Mode 21 + 224, 224, 224, 224, 224, 224, 224, 224, // Mode 22 + 224, 288, 256, 224, 224, 288, 256, 224, // Mode 23 + 224, 224, 224, 224, 224, 224, 224, 224, // Mode 24 + 320, 256, 288, 224, 320, 256, 288, 224, // Mode 25 + 256, 288, 256, 288, 256, 288, 256, 288, // Mode 26 + 256, 256, 352, 288, 256, 256, 352, 288, // Mode 27 + 256, 256, 256, 256, 256, 256, 256, 256, // Mode 28 + 352, 352, 256, 256, 352, 352, 256, 256, // Mode 29 + 352, 256, 352, 256, 352, 256, 352, 256, // Mode 30 + 384, 384, 352, 352, 416, 416, 448, 384, // Mode 31 + 448, 416, 352, 384, 448, 416, 352, 384, // Mode 32 + 448, 448, 352, 448, 448, 416, 448, 384, // Mode 33 + 448, 448, 448, 448, 448, 448, 448, 448 // Mode 34 +}; + + +// Shuffle vectors for w4 vertical. This is indexed based on the shape of delta int table for each mode. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ver[] = { // Shape of the delta int table in sets of four + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // [0, 1, 2, 3] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // [0, 1, 1, 2] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // [0, 0, 1, 2] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // [0, 1, 2, 2] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // [0, 0, 1, 1] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // [0, 0, 0, 1] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // [0, 1, 1, 1] + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [0, 0, 0, 0] + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [1, 1, 0, 0] + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [1, 0, 0, 0] + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [1, 1, 1, 0] + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [2, 1, 1, 0] + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [2, 1, 0, 0] + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [2, 2, 1, 0] + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // [3, 2, 1, 0] +}; + + ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, From 8fe8308adef7360a3a4b9387c1329b4e43277414 Mon Sep 17 00:00:00 
2001 From: siivonek Date: Fri, 9 Feb 2024 15:36:38 +0200 Subject: [PATCH 072/237] Implement faster vertical w8. Use coeff tables instead of building coeffs during runtime. --- src/strategies/avx2/intra-avx2.c | 21 ++++++++------ src/strategies/avx2/intra_avx2_tables.h | 37 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 193b9267..df75ed4d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1346,7 +1346,7 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re const int modulo = vnum - 1; int offset_num = 0; - int16_t shuffle_vector_offsets[8]; + ALIGNED(16) int16_t shuffle_vector_offsets[8]; memcpy(shuffle_vector_offsets, &intra_chroma_linear_interpolation_w4_ver_shuffle_vector_offset[mode_idx * 8], sizeof(int16_t) * 8); // Height has to be at least 4, handle 4 lines at once @@ -1383,7 +1383,7 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const int width = 8; const __m128i v16s = _mm_set1_epi16(16); @@ -1392,23 +1392,28 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); + const int mode_idx = (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + // Height has to be at least 2, handle 2 lines at once for (int y = 0; y < height; y += 2) { - int8_t tmp[2] = {32 - delta_fract[y + 0], delta_fract[y + 0]}; + /*int8_t tmp[2] = {32 - delta_fract[y + 0], delta_fract[y + 0]}; int16_t coeff_tmp0 = *(int16_t*)tmp; tmp[0] = 32 - delta_fract[y + 1]; tmp[1] = delta_fract[y + 1]; - int16_t coeff_tmp1 = *(int16_t*)tmp; - + int16_t coeff_tmp1 = *(int16_t*)tmp;*/ + const int16_t* coeff_tmp0 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 0]; + const int16_t* coeff_tmp1 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 2]; + __m128i vsrc0 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 0] + 1]); __m128i vsrc1 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 1] + 1]); vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); - const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp0); - const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp1); + const __m128i vcoeff0 = _mm_set1_epi16(*coeff_tmp0); + const __m128i vcoeff1 = _mm_set1_epi16(*coeff_tmp1); __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); @@ -2395,7 +2400,7 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, wide_angle_mode, pred_mode); break; - case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, delta_fract); break; case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, 
delta_int, delta_fract); break; default: diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 19961b92..3ae01f98 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -510,6 +510,43 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_h }; +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 + 9, 23, 18, 14, 27, 5, 4, 28, 13, 19, 22, 10, 31, 1, 8, 24, 17, 15, 26, 6, 3, 29, 12, 20, 21, 11, 30, 2, 7, 25, 16, 16, 25, 7, 2, 30, 11, 21, 20, 12, 29, 3, 6, 26, 15, 17, 24, 8, 1, 31, 10, 22, 19, 13, 28, 4, 5, 27, 14, 18, 23, 9, 32, 0, // Mode 5 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 6 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 7 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 8 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 9 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 10 + 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, 22, 10, 12, 20, 2, 30, 24, 8, 14, 18, 4, 28, 26, 6, 16, 16, 6, 26, 28, 4, 18, 14, 8, 24, 30, 2, 20, 12, 10, 22, 32, 0, // Mode 11 + 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, 24, 8, 16, 16, 8, 24, 32, 0, // Mode 12 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 13 + 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 
24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, 28, 4, 24, 8, 20, 12, 16, 16, 12, 20, 8, 24, 4, 28, 32, 0, // Mode 14 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 15 + 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, 30, 2, 28, 4, 26, 6, 24, 8, 22, 10, 20, 12, 18, 14, 16, 16, 14, 18, 12, 20, 10, 22, 8, 24, 6, 26, 4, 28, 2, 30, 32, 0, // Mode 16 + 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8, 23, 9, 22, 10, 21, 11, 20, 12, 19, 13, 18, 14, 17, 15, 16, 16, 15, 17, 14, 18, 13, 19, 12, 20, 11, 21, 10, 22, 9, 23, 8, 24, 7, 25, 6, 26, 5, 27, 4, 28, 3, 29, 2, 30, 1, 31, 32, 0, // Mode 17 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 18 + 1, 31, 2, 30, 3, 29, 4, 28, 5, 27, 6, 26, 7, 25, 8, 24, 9, 23, 10, 22, 11, 21, 12, 20, 13, 19, 14, 18, 15, 17, 16, 16, 17, 15, 18, 14, 19, 13, 20, 12, 21, 11, 22, 10, 23, 9, 24, 8, 25, 7, 26, 6, 27, 5, 28, 4, 29, 3, 30, 2, 31, 1, 32, 0, // Mode 19 + 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, 2, 30, 4, 28, 6, 26, 8, 24, 10, 22, 12, 20, 14, 18, 16, 16, 18, 14, 20, 12, 22, 10, 24, 8, 26, 6, 28, 4, 30, 2, 32, 0, // Mode 20 + 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 21 + 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, 4, 28, 8, 24, 12, 20, 16, 16, 20, 12, 24, 8, 28, 4, 32, 0, // Mode 22 + 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 23 + 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, 8, 24, 16, 16, 24, 8, 32, 0, // Mode 24 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode 25 + 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, 12, 20, 24, 8, 4, 28, 16, 16, 28, 4, 8, 24, 20, 12, 32, 0, // Mode 26 + 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, 14, 18, 28, 4, 10, 22, 24, 8, 6, 26, 20, 12, 2, 30, 16, 16, 30, 2, 12, 20, 26, 6, 8, 24, 22, 10, 4, 28, 18, 14, 32, 0, // Mode 27 + 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, 
16, 16, 32, 0, 16, 16, 32, 0, 16, 16, 32, 0, // Mode 28 + 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, 18, 14, 4, 28, 22, 10, 8, 24, 26, 6, 12, 20, 30, 2, 16, 16, 2, 30, 20, 12, 6, 26, 24, 8, 10, 22, 28, 4, 14, 18, 32, 0, // Mode 29 + 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, 20, 12, 8, 24, 28, 4, 16, 16, 4, 28, 24, 8, 12, 20, 32, 0, // Mode 30 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode 31 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode 32 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 33 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 +}; + + // Chroma linear interpolation filter weights for width 4, horizontal modes ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 From 7a5f64ccef6e49b6d12546e5932a9cdecebc6e52 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Feb 2024 13:56:04 +0200 Subject: [PATCH 073/237] Implement faster w16 and w32. The speed of new w32 is mostly same as before though. 
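For reference, the change below swaps the per-row byte packing (32 - delta_fract[y], delta_fract[y]) for a lookup into the precomputed w8 vertical weight table. A minimal sketch of the addressing, assuming the layout documented in intra_avx2_tables.h (64 bytes per mode, one (32 - f, f) byte pair per row y); the helper name is illustrative and not part of the patch:

    /* Illustrative helper only: fetch the packed (32 - f, f) weight pair for row y
     * of a regular (non-wide-angle) chroma mode. Assumes <stdint.h> types. */
    static inline int16_t w8_ver_weight_pair(const int8_t *table, int pred_mode, int y)
    {
      const int mode_idx = pred_mode <= 34 ? pred_mode - 2 : 66 - pred_mode;
      return *(const int16_t *)&table[mode_idx * 64 + (y << 1)];
    }

The pair then feeds _mm_maddubs_epi16 / _mm256_maddubs_epi16, which forms (32 - f) * a + f * b for each sample pair before the +16 rounding and >>5.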
--- src/strategies/avx2/intra-avx2.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index df75ed4d..eb3c9efb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1397,12 +1397,6 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re // Height has to be at least 2, handle 2 lines at once for (int y = 0; y < height; y += 2) { - /*int8_t tmp[2] = {32 - delta_fract[y + 0], delta_fract[y + 0]}; - int16_t coeff_tmp0 = *(int16_t*)tmp; - tmp[0] = 32 - delta_fract[y + 1]; - tmp[1] = delta_fract[y + 1]; - int16_t coeff_tmp1 = *(int16_t*)tmp;*/ - const int16_t* coeff_tmp0 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 0]; const int16_t* coeff_tmp1 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 2]; @@ -1428,7 +1422,7 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const __m128i v16s = _mm_set1_epi16(16); const __m128i vshuf = _mm_setr_epi8( @@ -1436,11 +1430,13 @@ static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* r 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); + const int mode_idx = (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + // Handle 1 line at a time for (int y = 0; y < height; ++y) { - int8_t tmp0[2] = { 32 - delta_fract[y + 0], delta_fract[y + 0]}; - int16_t coeff_tmp = *(int16_t*)tmp0; - __m128i vcoeff = _mm_set1_epi16(coeff_tmp); + const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1)]; + __m128i vcoeff = _mm_set1_epi16(*coeff_tmp); __m128i vsrc0 = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + 0 + 1]); __m128i vsrc1 = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + 8 + 1]); @@ -1461,7 +1457,7 @@ static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const __m256i v16s = _mm256_set1_epi16(16); const __m256i vshuf = _mm256_setr_epi8( @@ -1471,11 +1467,13 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); + const int mode_idx = (pred_mode <= 34 ? 
(pred_mode - 2) : (66 - pred_mode)); + const int coeff_table_offset = mode_idx * 64; + // Handle 1 line at a time for (int y = 0; y < height; ++y) { - int8_t tmp0[2] = { 32 - delta_fract[y + 0], delta_fract[y + 0] }; - int16_t coeff_tmp = *(int16_t*)tmp0; - __m256i vcoeff = _mm256_set1_epi16(coeff_tmp); + const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1)]; + __m256i vcoeff = _mm256_set1_epi16(*coeff_tmp); __m128i vsrc[4]; vsrc[0] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 0 + 1]); @@ -2401,8 +2399,8 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, wide_angle_mode, pred_mode); break; case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, delta_fract); break; - case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; + case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From 04dde565373a3a652b10f4f9384831f89a6aba8e Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 9 Feb 2024 00:54:41 +0200 Subject: [PATCH 074/237] Separate non-fractional angle pixel copy functions to vertical and horizontal. Horizontal is a generic placeholder until I figure out something better. --- src/strategies/avx2/intra-avx2.c | 108 ++++++++++++++++++++++++++++--- 1 file changed, 98 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index eb3c9efb..0cda6b6f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1335,12 +1335,12 @@ static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, c // Linear interpolation filter for width 4 has a different call, since it uses premade tables for coefficients -static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const bool wide_angle_mode, const int32_t pred_mode) +static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int32_t pred_mode) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); - const int mode_idx = wide_angle_mode ? 0 : (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int mode_idx = pred_mode <= 34 ? 
pred_mode - 2 : 66 - pred_mode; const int weight_table_offset = coeff_table_mode_offsets[mode_idx]; const int vnum = coeff_vector128_num_by_mode[mode_idx]; const int modulo = vnum - 1; @@ -1668,6 +1668,52 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r } +static void angular_pred_avx2_linear_filter_w4_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 4; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 4, handle 4 lines at once + for (int y = 0; y < height; y += 4) { + uvg_pixel src[32]; + int16_t coeff_tmp[4]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int yy = 0; yy < 4; ++yy) { + src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; + src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; + int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; + coeff_tmp[yy] = *(int16_t*)tmp; + } + dint += 4; + + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], + coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], + coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } + +} + + static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { // 2-tap linear filter @@ -1682,8 +1728,9 @@ static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, } -static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) +static void angular_pred_avx2_non_fractional_angle_pxl_copy_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { + // Note: this probably won't work for wide angle modes. 
for (int y = 0; y < height; ++y) { uvg_pixel* dst_row = dst + y * width; uvg_pixel* ref_row = ref + delta_int[y] + 1; @@ -1697,6 +1744,16 @@ static void angular_pred_avx2_non_fractional_angle_pxl_copy(uvg_pixel* dst, uvg_ } } +static void angular_pred_avx2_non_fractional_angle_pxl_copy_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) +{ + // TODO: replace this generic solution after testing + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dst[y * width + x] = ref[delta_int[x] + y + 1]; + } + } +} + static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { @@ -2395,19 +2452,44 @@ static void uvg_angular_pred_avx2( // Chroma channels else { // Do 2-tap linear filtering for chroma channels - if (vertical_mode) { - switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, wide_angle_mode, pred_mode); break; + if (wide_angle_mode) { + if (vertical_mode) { + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 8: break; + case 16: break; + case 32: break; + default: + assert(false && "Intra angular predicion: illegal chroma width.\n"); + break; + } + } + else { + switch (width) { + case 4: break; + case 8: break; + case 16: break; + case 32: break; + default: + assert(false && "Intra angular predicion: illegal chroma width.\n"); + break; + } + } + } + else { + if (vertical_mode) { + switch (width) { + case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); break; case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; + } } - } - else { - switch (width) { + else { + switch (width) { case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, pred_mode, delta_int); break; @@ -2415,13 +2497,19 @@ static void uvg_angular_pred_avx2( default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; + } } } } } else { // No interpolation or filtering needed, just copy the integer samples - angular_pred_avx2_non_fractional_angle_pxl_copy(dst, ref_main, width, height, delta_int); + if (vertical_mode) { + angular_pred_avx2_non_fractional_angle_pxl_copy_ver(dst, ref_main, width, height, delta_int); + } + else { + angular_pred_avx2_non_fractional_angle_pxl_copy_hor(dst, ref_main, width, height, delta_int); + } } } else { From b99ba063c4590c7392c5f6bf1ebf232c903f873d Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 13 Feb 2024 23:13:37 +0200 Subject: [PATCH 075/237] Implement vertical chroma linear interpolation functions for wide angles. The memory management is placeholder, replace with a faster version later. 
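All of the vectorized paths below compute the same 2-tap chroma filter; a rough scalar model of the vertical case is sketched here for reference only (this function is not part of the patch, and uvg_pixel is assumed to be 8-bit):

    #include <stdint.h>

    /* Scalar sketch of the vertical chroma 2-tap interpolation: one fractional
     * weight per row y, applied across every column x of that row. */
    static void linear_filter_ver_scalar(uint8_t *dst, const uint8_t *ref,
                                         int width, int height,
                                         const int16_t *delta_int,
                                         const int16_t *delta_fract)
    {
      for (int y = 0; y < height; ++y) {
        const int f = delta_fract[y];
        for (int x = 0; x < width; ++x) {
          const int a = ref[delta_int[y] + x + 1];
          const int b = ref[delta_int[y] + x + 2];
          dst[y * width + x] = (uint8_t)(((32 - f) * a + f * b + 16) >> 5);
        }
      }
    }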
--- src/strategies/avx2/intra-avx2.c | 117 ++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0cda6b6f..de89567d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1710,7 +1710,118 @@ static void angular_pred_avx2_linear_filter_w4_ver_wide_angle(uvg_pixel* dst, uv _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); dst += 16; } +} + + +static void angular_pred_avx2_linear_filter_w8_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 8; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + uvg_pixel src[32]; + int16_t coeff_tmp[2]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int yy = 0; yy < 2; ++yy) { + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[yy * 16 + d + 0] = ref[dint[yy] + 1 + x + 0]; + src[yy * 16 + d + 1] = ref[dint[yy] + 1 + x + 1]; + } + int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; + coeff_tmp[yy] = *(int16_t*)tmp; + } + dint += 2; + + const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp[0]); + const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp[1]); + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w16_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 16; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 2, handle 1 line at a time + for (int y = 0; y < height; ++y) { + uvg_pixel src[32]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[d + 0] = ref[*dint + 1 + x + 0]; + src[d + 1] = ref[*dint + 1 + x + 1]; + } + dint++; + + int8_t tmp[2] = { 32 - delta_fract[y], delta_fract[y] }; + const int16_t coeff_tmp = *(int16_t*)tmp; + const __m128i vcoeff = _mm_set1_epi16(coeff_tmp); + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 32; + const int16_t* dint = delta_int; + const __m256i v16s = _mm256_set1_epi16(16); + // Height has to be at least 2, handle 1 line at a time + for (int y = 0; y < height; ++y) { + uvg_pixel src[64]; + // TODO: get rid of this slow 
crap, this is just here to test the calculations + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[d + 0] = ref[*dint + 1 + x + 0]; + src[d + 1] = ref[*dint + 1 + x + 1]; + } + dint++; + + int8_t tmp[2] = { 32 - delta_fract[y], delta_fract[y] }; + const int16_t coeff_tmp = *(int16_t*)tmp; + const __m256i vcoeff = _mm256_set1_epi16(coeff_tmp); + + const __m256i* vsrc0 = (const __m256i*) & src[0]; + const __m256i* vsrc1 = (const __m256i*) & src[32]; + + __m256i res0 = _mm256_maddubs_epi16(*vsrc0, vcoeff); + __m256i res1 = _mm256_maddubs_epi16(*vsrc1, vcoeff); + res0 = _mm256_add_epi16(res0, v16s); + res1 = _mm256_add_epi16(res1, v16s); + res0 = _mm256_srai_epi16(res0, 5); + res1 = _mm256_srai_epi16(res1, 5); + + __m256i vfinal = _mm256_packus_epi16(res0, res1); + vfinal = _mm256_permute4x64_epi64(vfinal, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, vfinal); + dst += 32; + } } @@ -2456,9 +2567,9 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 8: break; - case 16: break; - case 32: break; + case 8: angular_pred_avx2_linear_filter_w8_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From c40fc96929269c5cab97cee558d2ded9ce934924 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 14 Feb 2024 00:06:14 +0200 Subject: [PATCH 076/237] Implement horizontal chroma linear interpolation functions for wide angles. Placeholder memory management. w32 is omitted, there is a possibility it will never be called. 
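For comparison with the vertical case, a scalar sketch of what the horizontal wide-angle paths compute (illustrative only, not part of the patch): the displacement and the weight pair now vary per column x rather than per row y, which is also why the weights can be hoisted out of the y loop in the AVX2 code.

    #include <stdint.h>

    /* Scalar sketch of the horizontal chroma 2-tap interpolation. */
    static void linear_filter_hor_scalar(uint8_t *dst, const uint8_t *ref,
                                         int width, int height,
                                         const int16_t *delta_int,
                                         const int16_t *delta_fract)
    {
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          const int f = delta_fract[x];
          const int a = ref[delta_int[x] + y + 1];
          const int b = ref[delta_int[x] + y + 2];
          dst[y * width + x] = (uint8_t)(((32 - f) * a + f * b + 16) >> 5);
        }
      }
    }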
--- src/strategies/avx2/intra-avx2.c | 138 +++++++++++++++++++++++++++++-- 1 file changed, 133 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index de89567d..db2222e5 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1825,6 +1825,134 @@ static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, u } +static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 4; + const __m128i v16s = _mm_set1_epi16(16); + + int16_t coeff_tmp[4]; + for (int x = 0; x < width; ++x) { + int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; + coeff_tmp[x] = *(int16_t*)tmp; + } + + const __m128i vcoeff = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], + coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3]); + + // Height has to be at least 4, handle 4 lines at once + for (int y = 0; y < height; y += 4) { + uvg_pixel src[32]; + + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[d + 0] = ref[delta_int[x] + y + 1 + 0]; + src[d + 1] = ref[delta_int[x] + y + 1 + 1]; + src[d + 8] = ref[delta_int[x] + y + 2 + 0]; + src[d + 9] = ref[delta_int[x] + y + 2 + 1]; + src[d + 16] = ref[delta_int[x] + y + 3 + 0]; + src[d + 17] = ref[delta_int[x] + y + 3 + 1]; + src[d + 24] = ref[delta_int[x] + y + 4 + 0]; + src[d + 25] = ref[delta_int[x] + y + 4 + 1]; + } + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 8; + const __m128i v16s = _mm_set1_epi16(16); + + int16_t coeff_tmp[8]; + for (int x = 0; x < width; ++x) { + int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; + coeff_tmp[x] = *(int16_t*)tmp; + } + + const __m128i vcoeff = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], + coeff_tmp[4], coeff_tmp[5], coeff_tmp[6], coeff_tmp[7]); + + // Height has to be at least 2, handle 2 lines at once + for (int y = 0; y < height; y += 2) { + uvg_pixel src[32]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[d + 0] = ref[delta_int[x] + y + 1 + 0]; + src[d + 1] = ref[delta_int[x] + y + 1 + 1]; + src[d + 16] = ref[delta_int[x] + y + 2 + 0]; + src[d + 17] = ref[delta_int[x] + y + 2 + 1]; + } + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + +static void 
angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 16; + const __m128i v16s = _mm_set1_epi16(16); + + int16_t coeff_tmp[16]; + for (int x = 0; x < width; ++x) { + int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; + coeff_tmp[x] = *(int16_t*)tmp; + } + + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], + coeff_tmp[4], coeff_tmp[5], coeff_tmp[6], coeff_tmp[7]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[ 8], coeff_tmp[ 9], coeff_tmp[10], coeff_tmp[11], + coeff_tmp[12], coeff_tmp[13], coeff_tmp[14], coeff_tmp[15]); + + // Height has to be at least 2, handle 1 line at a time + for (int y = 0; y < height; ++y) { + uvg_pixel src[32]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int x = 0, d = 0; x < width; ++x, d += 2) { + src[d + 0] = ref[delta_int[x] + y + 1 + 0]; + src[d + 1] = ref[delta_int[x] + y + 1 + 1]; + } + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { // 2-tap linear filter @@ -2569,7 +2697,7 @@ static void uvg_angular_pred_avx2( case 4: angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 8: angular_pred_avx2_linear_filter_w8_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; // This may never get reached. default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; @@ -2577,10 +2705,10 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: break; - case 8: break; - case 16: break; - case 32: break; + case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 8: angular_pred_avx2_linear_filter_w8_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 16: angular_pred_avx2_linear_filter_w16_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 32: break; // This may never get reached. default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From 4292c64229fe0a144ef201cfccf037f3e54de31c Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 14 Feb 2024 15:17:09 +0200 Subject: [PATCH 077/237] Fix error with sample_disp. Add assert to code branch which is not executed with YUV420 pixel format. 
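A short arithmetic note on the sample_disp widening, derived from the wide-angle delta tables added earlier in this series (the exact values are informational, not normative): some rows of delta_int_wide_angle_table advance by 16 per step, i.e. a displacement of 16 * 32 = 512 in 1/32-sample units, and intermediate wide angles yield values such as 341. Both exceed the [-128, 127] range of an 8-bit signed integer, and int_fast8_t is commonly 8 bits wide, so the value is widened to int_fast16_t.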
--- src/strategies/avx2/intra-avx2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index db2222e5..0e0ae03f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2577,7 +2577,7 @@ static void uvg_angular_pred_avx2( const bool wide_angle_mode = mode_disp > 16; // Sample displacement per column in fractions of 32. - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + const int_fast16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; const int side_size = vertical_mode ? log2_height : log2_width; int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); @@ -2697,7 +2697,7 @@ static void uvg_angular_pred_avx2( case 4: angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 8: angular_pred_avx2_linear_filter_w8_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; // This may never get reached. + case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; @@ -2708,7 +2708,7 @@ static void uvg_angular_pred_avx2( case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 8: angular_pred_avx2_linear_filter_w8_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 32: break; // This may never get reached. + case 32: assert(false && "This code branch only works with UVG_FORMAT_P420."); break; // This branch is never executed with UVG_FORMAT_P420, due to chroma being only 32 width or height default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From 24e19665cab8a4f7781f225401ed8046eaf3b726 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 14 Feb 2024 17:15:53 +0200 Subject: [PATCH 078/237] Add intra wide angle tables to python script. 
--- src/strategies/avx2/intra-avx2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0e0ae03f..230a1fe6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -263,7 +263,6 @@ ALIGNED(32) static const int16_t delta_int_wide_angle_table[1792] = { 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, // 79 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, - }; // TODO: cut out the latter 32 entries due to symmetry. Also, cut in half due to vertical symmetry From 7eda0e0075d92c789a655100dc1c81328b310e3c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 15 Feb 2024 17:37:53 +0200 Subject: [PATCH 079/237] Implement wide angles for w8, w16 and w32 vertical. --- src/strategies/avx2/intra-avx2.c | 115 ++++++++++----- src/strategies/avx2/intra_avx2_tables.h | 178 +++++++++++++++++++++++- 2 files changed, 257 insertions(+), 36 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 230a1fe6..8a87df30 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1390,14 +1390,15 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); - - const int mode_idx = (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int wide_angle = pred_mode > 66 || pred_mode < 2; + const int mode_idx = wide_angle ? (pred_mode < 2 ? 12 + pred_mode : 80 - pred_mode) : (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int8_t* coeff_table = wide_angle ? 
intra_chroma_linear_interpolation_weights_w8_ver_wide_angle : intra_chroma_linear_interpolation_weights_w8_ver; const int coeff_table_offset = mode_idx * 64; // Height has to be at least 2, handle 2 lines at once for (int y = 0; y < height; y += 2) { - const int16_t* coeff_tmp0 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 0]; - const int16_t* coeff_tmp1 = (const int16_t*) &intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1) + 2]; + const int16_t* coeff_tmp0 = (const int16_t*) &coeff_table[coeff_table_offset + (y << 1) + 0]; + const int16_t* coeff_tmp1 = (const int16_t*) &coeff_table[coeff_table_offset + (y << 1) + 2]; __m128i vsrc0 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 0] + 1]); __m128i vsrc1 = _mm_loadu_si128((const __m128i*) & ref[delta_int[y + 1] + 1]); @@ -1429,12 +1430,14 @@ static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* r 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); - const int mode_idx = (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int wide_angle = pred_mode > 66 || pred_mode < 2; + const int mode_idx = wide_angle ? (pred_mode < 2 ? 12 + pred_mode : 80 - pred_mode) : (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int8_t* coeff_table = wide_angle ? intra_chroma_linear_interpolation_weights_w8_ver_wide_angle : intra_chroma_linear_interpolation_weights_w8_ver; const int coeff_table_offset = mode_idx * 64; // Handle 1 line at a time for (int y = 0; y < height; ++y) { - const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1)]; + const int16_t* coeff_tmp = (const int16_t*)&coeff_table[coeff_table_offset + (y << 1)]; __m128i vcoeff = _mm_set1_epi16(*coeff_tmp); __m128i vsrc0 = _mm_loadu_si128((const __m128i*)&ref[delta_int[y] + 0 + 1]); @@ -1466,12 +1469,14 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); - const int mode_idx = (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int wide_angle = pred_mode > 66 || pred_mode < 2; + const int mode_idx = wide_angle ? (pred_mode < 2 ? 12 + pred_mode : 80 - pred_mode) : (pred_mode <= 34 ? (pred_mode - 2) : (66 - pred_mode)); + const int8_t* coeff_table = wide_angle ? intra_chroma_linear_interpolation_weights_w8_ver_wide_angle : intra_chroma_linear_interpolation_weights_w8_ver; const int coeff_table_offset = mode_idx * 64; // Handle 1 line at a time for (int y = 0; y < height; ++y) { - const int16_t* coeff_tmp = (const int16_t*)&intra_chroma_linear_interpolation_weights_w8_ver[coeff_table_offset + (y << 1)]; + const int16_t* coeff_tmp = (const int16_t*)&coeff_table[coeff_table_offset + (y << 1)]; __m256i vcoeff = _mm256_set1_epi16(*coeff_tmp); __m128i vsrc[4]; @@ -1667,6 +1672,7 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r } +// Can handle most of the wide angle modes. static void angular_pred_avx2_linear_filter_w4_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 4; @@ -1712,6 +1718,52 @@ static void angular_pred_avx2_linear_filter_w4_ver_wide_angle(uvg_pixel* dst, uv } +// Handles the extreme wide angle modes, as these need a different memory load pattern. 
+static void angular_pred_avx2_linear_filter_w4_ver_extreme_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +{ + const int width = 4; + const int16_t* dint = delta_int; + const __m128i v16s = _mm_set1_epi16(16); + // Height has to be at least 4, handle 4 lines at once + for (int y = 0; y < height; y += 4) { + uvg_pixel src[32]; + int16_t coeff_tmp[4]; + // TODO: get rid of this slow crap, this is just here to test the calculations + for (int yy = 0; yy < 4; ++yy) { + src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; + src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; + src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; + src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; + src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; + int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; + coeff_tmp[yy] = *(int16_t*)tmp; + } + dint += 4; + + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], + coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], + coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + + const __m128i* vsrc0 = (const __m128i*) & src[0]; + const __m128i* vsrc1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + + _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + dst += 16; + } +} + + static void angular_pred_avx2_linear_filter_w8_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; @@ -2653,7 +2705,7 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. // Set delta table pointers - const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (pred_mode - 67 + 14) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); + const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (80 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset]; const int16_t* delta_fract = wide_angle_mode ? 
&delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; @@ -2690,37 +2742,30 @@ static void uvg_angular_pred_avx2( // Chroma channels else { // Do 2-tap linear filtering for chroma channels - if (wide_angle_mode) { - if (vertical_mode) { - switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 8: angular_pred_avx2_linear_filter_w8_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 16: angular_pred_avx2_linear_filter_w16_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 32: angular_pred_avx2_linear_filter_w32_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - default: - assert(false && "Intra angular predicion: illegal chroma width.\n"); - break; - } + + if (vertical_mode) { + switch (width) { + case 4: + if (wide_angle_mode) + angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); + else + angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); + break; + case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; + case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; + case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; + default: + assert(false && "Intra angular predicion: illegal chroma width.\n"); + break; } - else { + } + else { + if (wide_angle_mode) { switch (width) { case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 8: angular_pred_avx2_linear_filter_w8_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 32: assert(false && "This code branch only works with UVG_FORMAT_P420."); break; // This branch is never executed with UVG_FORMAT_P420, due to chroma being only 32 width or height - default: - assert(false && "Intra angular predicion: illegal chroma width.\n"); - break; - } - } - } - else { - if (vertical_mode) { - switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; + case 32: assert(false && "This code branch only works with UVG_FORMAT_P420."); break; // This branch is never executed with UVG_FORMAT_P420, due to chroma being only 32 width or height. default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 3ae01f98..029e6aec 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -19,12 +19,17 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m30_coeff[] = { 20, 12, 8, 24, 28, 04, 16, 16, 20, 12, 8, 24, 28, 04, 16, 16, }; + // The number of unique 128-bit coefficient vectors for a given prediction mode. 
Applicable for width 4 chroma linear interpolation. -const int8_t coeff_vector128_num_by_mode[33] = { +ALIGNED(32) const int8_t coeff_vector128_num_by_mode[33] = { 1, 16, 8, 16, 4, 8, 1, 8, 4, 8, 2, 8, 4, 16, 8, 16, 1, 16, 8, 16, 4, 8, 2, 8, 4, 8, 1, 8, 4, 16, 8, 16, 1 }; +ALIGNED(32) const int8_t coeff_vector128_num_by_mode_wide_angle[14] = { + 1, 16, 1, 16, 1, 8, 8, 16, 1, 16, 16, 16, 16, 16 +}; + const int16_t coeff_table_mode_offsets[33] = { 0, 16, 272, 400, 656, 720, 848, 864, 992, 1056, 1184, 1216, 1344, 1408, 1664, 1792, @@ -510,6 +515,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_h }; +// Chroma linear interpolation filter weights for width 8, vertical modes. These also work for w16 and w32. ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 @@ -546,6 +552,23 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 }; +// Chroma linear interpolation filter weights for width 8, vertical wide angle modes. These also work for w16 and w32. +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver_wide_angle[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 Offset 0 + 11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 30, 13, 19, 24, 8, 3, 29, 14, 18, 25, 7, 4, 28, 15, 17, 26, 6, 5, 27, 16, 16, 27, 5, 6, 26, 17, 15, 28, 4, 7, 25, 18, 14, 29, 3, 8, 24, 19, 13, 30, 2, 9, 23, 20, 12, 31, 1, 10, 22, 21, 11, 32, 0, // Mode -11 Offset 64 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 Offset 128 + 21, 11, 10, 22, 31, 1, 20, 12, 9, 23, 30, 2, 19, 13, 8, 24, 29, 3, 18, 14, 7, 25, 28, 4, 17, 15, 6, 26, 27, 5, 16, 16, 5, 27, 26, 6, 15, 17, 4, 28, 25, 7, 14, 18, 3, 29, 24, 8, 13, 19, 2, 30, 23, 9, 12, 20, 1, 31, 22, 10, 11, 21, 32, 0, // Mode -9 Offset 192 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 Offset 256 + 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, 26, 6, 20, 12, 14, 18, 8, 24, 2, 30, 28, 4, 22, 10, 16, 16, 10, 22, 4, 28, 30, 2, 24, 8, 18, 14, 12, 20, 6, 26, 32, 0, // Mode -7 Offset 320 + 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, 10, 22, 20, 12, 30, 2, 8, 24, 18, 14, 28, 4, 
6, 26, 16, 16, 26, 6, 4, 28, 14, 18, 24, 8, 2, 30, 12, 20, 22, 10, 32, 0, // Mode -6 Offset 384 + 23, 9, 14, 18, 5, 27, 28, 4, 19, 13, 10, 22, 1, 31, 24, 8, 15, 17, 6, 26, 29, 3, 20, 12, 11, 21, 2, 30, 25, 7, 16, 16, 7, 25, 30, 2, 21, 11, 12, 20, 3, 29, 26, 6, 17, 15, 8, 24, 31, 1, 22, 10, 13, 19, 4, 28, 27, 5, 18, 14, 9, 23, 32, 0, // Mode -5 Offset 448 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 Offset 512 + 7, 25, 14, 18, 21, 11, 28, 4, 3, 29, 10, 22, 17, 15, 24, 8, 31, 1, 6, 26, 13, 19, 20, 12, 27, 5, 2, 30, 9, 23, 16, 16, 23, 9, 30, 2, 5, 27, 12, 20, 19, 13, 26, 6, 1, 31, 8, 24, 15, 17, 22, 10, 29, 3, 4, 28, 11, 21, 18, 14, 25, 7, 32, 0, // Mode -3 Offset 576 + 13, 19, 26, 6, 7, 25, 20, 12, 1, 31, 14, 18, 27, 5, 8, 24, 21, 11, 2, 30, 15, 17, 28, 4, 9, 23, 22, 10, 3, 29, 16, 16, 29, 3, 10, 22, 23, 9, 4, 28, 17, 15, 30, 2, 11, 21, 24, 8, 5, 27, 18, 14, 31, 1, 12, 20, 25, 7, 6, 26, 19, 13, 32, 0, // Mode -2 Offset 640 + 19, 13, 6, 26, 25, 7, 12, 20, 31, 1, 18, 14, 5, 27, 24, 8, 11, 21, 30, 2, 17, 15, 4, 28, 23, 9, 10, 22, 29, 3, 16, 16, 3, 29, 22, 10, 9, 23, 28, 4, 15, 17, 2, 30, 21, 11, 8, 24, 27, 5, 14, 18, 1, 31, 20, 12, 7, 25, 26, 6, 13, 19, 32, 0, // Mode -1 Offset 704 + 25, 7, 18, 14, 11, 21, 4, 28, 29, 3, 22, 10, 15, 17, 8, 24, 1, 31, 26, 6, 19, 13, 12, 20, 5, 27, 30, 2, 23, 9, 16, 16, 9, 23, 2, 30, 27, 5, 20, 12, 13, 19, 6, 26, 31, 1, 24, 8, 17, 15, 10, 22, 3, 29, 28, 4, 21, 11, 14, 18, 7, 25, 32, 0, // Mode 0 Offset 768 + 29, 3, 26, 6, 23, 9, 20, 12, 17, 15, 14, 18, 11, 21, 8, 24, 5, 27, 2, 30, 31, 1, 28, 4, 25, 7, 22, 10, 19, 13, 16, 16, 13, 19, 10, 22, 7, 25, 4, 28, 1, 31, 30, 2, 27, 5, 24, 8, 21, 11, 18, 14, 15, 17, 12, 20, 9, 23, 6, 26, 3, 29, 32, 0, // Mode 1 Offset 832 +}; // Chroma linear interpolation filter weights for width 4, horizontal modes ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { @@ -960,4 +983,157 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 34 Offset 4096 }; + +// Chroma linear interpolation filter weights for width 4, wide angle vertical modes. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_angle[2368] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 Offset 0 + 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, // Mode -11 Offset 16 + 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, + 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, + 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, + 3, 29, 3, 29, 3, 29, 3, 29, 14, 18, 14, 18, 14, 18, 14, 18, + 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, + 15, 17, 15, 17, 15, 17, 15, 17, 26, 6, 26, 6, 26, 6, 26, 6, + 5, 27, 5, 27, 5, 27, 5, 27, 16, 16, 16, 16, 16, 16, 16, 16, + 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, + 17, 15, 17, 15, 17, 15, 17, 15, 28, 4, 28, 4, 28, 4, 28, 4, + 7, 25, 7, 25, 7, 25, 7, 25, 18, 14, 18, 14, 18, 14, 18, 14, + 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, + 19, 13, 19, 13, 19, 13, 19, 13, 30, 2, 30, 2, 30, 2, 30, 2, + 9, 23, 9, 23, 9, 23, 9, 23, 20, 12, 20, 12, 20, 12, 20, 12, + 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, + 21, 11, 21, 11, 21, 11, 21, 11, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 Offset 272 + 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, // Mode -9 Offset 288 + 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, + 9, 23, 9, 23, 9, 23, 9, 23, 30, 2, 30, 2, 30, 2, 30, 2, + 19, 13, 19, 13, 19, 13, 19, 13, 8, 24, 8, 24, 8, 24, 8, 24, + 29, 3, 29, 3, 29, 3, 29, 3, 18, 14, 18, 14, 18, 14, 18, 14, + 7, 25, 7, 25, 7, 25, 7, 25, 28, 4, 28, 4, 28, 4, 28, 4, + 17, 15, 17, 15, 17, 15, 17, 15, 6, 26, 6, 26, 6, 26, 6, 26, + 27, 5, 27, 5, 27, 5, 27, 5, 16, 16, 16, 16, 16, 16, 16, 16, + 5, 27, 5, 27, 5, 27, 5, 27, 26, 6, 26, 6, 26, 6, 26, 6, + 15, 17, 15, 17, 15, 17, 15, 17, 4, 28, 4, 28, 4, 28, 4, 28, + 25, 7, 25, 7, 25, 7, 25, 7, 14, 18, 14, 18, 14, 18, 14, 18, + 3, 29, 3, 29, 3, 29, 3, 29, 24, 8, 24, 8, 24, 8, 24, 8, + 13, 19, 13, 19, 13, 19, 13, 19, 2, 30, 2, 30, 2, 30, 2, 30, + 23, 9, 23, 9, 23, 9, 23, 9, 12, 20, 12, 20, 12, 20, 12, 20, + 1, 31, 1, 31, 1, 31, 1, 31, 22, 10, 22, 10, 22, 10, 22, 10, + 11, 21, 11, 21, 11, 21, 11, 21, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 Offset 544 + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -7 Offset 560 + 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, + 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, + 10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, + 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, + 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, + 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -6 Offset 688 + 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, + 18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, + 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, + 26, 6, 26, 6, 26, 6, 26, 6, 4, 28, 4, 28, 4, 28, 4, 28, + 14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, + 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, + 22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, + 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode -5 Offset 816 + 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, + 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 
10, 22, 10, 22, 10, 22, + 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, + 15, 17, 15, 17, 15, 17, 15, 17, 6, 26, 6, 26, 6, 26, 6, 26, + 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, + 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, + 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, + 7, 25, 7, 25, 7, 25, 7, 25, 30, 2, 30, 2, 30, 2, 30, 2, + 21, 11, 21, 11, 21, 11, 21, 11, 12, 20, 12, 20, 12, 20, 12, 20, + 3, 29, 3, 29, 3, 29, 3, 29, 26, 6, 26, 6, 26, 6, 26, 6, + 17, 15, 17, 15, 17, 15, 17, 15, 8, 24, 8, 24, 8, 24, 8, 24, + 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, + 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, + 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, + 9, 23, 9, 23, 9, 23, 9, 23, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 Offset 1072 + 7, 25, 7, 25, 7, 25, 7, 25, 14, 18, 14, 18, 14, 18, 14, 18, // Mode -3 Offset 1088 + 21, 11, 21, 11, 21, 11, 21, 11, 28, 4, 28, 4, 28, 4, 28, 4, + 3, 29, 3, 29, 3, 29, 3, 29, 10, 22, 10, 22, 10, 22, 10, 22, + 17, 15, 17, 15, 17, 15, 17, 15, 24, 8, 24, 8, 24, 8, 24, 8, + 31, 1, 31, 1, 31, 1, 31, 1, 6, 26, 6, 26, 6, 26, 6, 26, + 13, 19, 13, 19, 13, 19, 13, 19, 20, 12, 20, 12, 20, 12, 20, 12, + 27, 5, 27, 5, 27, 5, 27, 5, 2, 30, 2, 30, 2, 30, 2, 30, + 9, 23, 9, 23, 9, 23, 9, 23, 16, 16, 16, 16, 16, 16, 16, 16, + 23, 9, 23, 9, 23, 9, 23, 9, 30, 2, 30, 2, 30, 2, 30, 2, + 5, 27, 5, 27, 5, 27, 5, 27, 12, 20, 12, 20, 12, 20, 12, 20, + 19, 13, 19, 13, 19, 13, 19, 13, 26, 6, 26, 6, 26, 6, 26, 6, + 1, 31, 1, 31, 1, 31, 1, 31, 8, 24, 8, 24, 8, 24, 8, 24, + 15, 17, 15, 17, 15, 17, 15, 17, 22, 10, 22, 10, 22, 10, 22, 10, + 29, 3, 29, 3, 29, 3, 29, 3, 4, 28, 4, 28, 4, 28, 4, 28, + 11, 21, 11, 21, 11, 21, 11, 21, 18, 14, 18, 14, 18, 14, 18, 14, + 25, 7, 25, 7, 25, 7, 25, 7, 32, 0, 32, 0, 32, 0, 32, 0, + 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, // Mode -2 Offset 1344 + 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, + 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, + 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, + 21, 11, 21, 11, 21, 11, 21, 11, 2, 30, 2, 30, 2, 30, 2, 30, + 15, 17, 15, 17, 15, 17, 15, 17, 28, 4, 28, 4, 28, 4, 28, 4, + 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, + 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, + 29, 3, 29, 3, 29, 3, 29, 3, 10, 22, 10, 22, 10, 22, 10, 22, + 23, 9, 23, 9, 23, 9, 23, 9, 4, 28, 4, 28, 4, 28, 4, 28, + 17, 15, 17, 15, 17, 15, 17, 15, 30, 2, 30, 2, 30, 2, 30, 2, + 11, 21, 11, 21, 11, 21, 11, 21, 24, 8, 24, 8, 24, 8, 24, 8, + 5, 27, 5, 27, 5, 27, 5, 27, 18, 14, 18, 14, 18, 14, 18, 14, + 31, 1, 31, 1, 31, 1, 31, 1, 12, 20, 12, 20, 12, 20, 12, 20, + 25, 7, 25, 7, 25, 7, 25, 7, 6, 26, 6, 26, 6, 26, 6, 26, + 19, 13, 19, 13, 19, 13, 19, 13, 32, 0, 32, 0, 32, 0, 32, 0, + 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, // Mode -1 Offset 1600 + 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, + 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, + 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, + 11, 21, 11, 21, 11, 21, 11, 21, 30, 2, 30, 2, 30, 2, 30, 2, + 17, 15, 17, 15, 17, 15, 17, 15, 4, 28, 4, 28, 4, 28, 4, 28, + 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, + 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, + 3, 29, 3, 29, 3, 29, 3, 29, 22, 10, 22, 10, 22, 10, 22, 10, + 9, 23, 9, 23, 9, 23, 9, 23, 28, 4, 28, 4, 28, 4, 28, 4, + 15, 17, 15, 
17, 15, 17, 15, 17, 2, 30, 2, 30, 2, 30, 2, 30, + 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, + 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, + 1, 31, 1, 31, 1, 31, 1, 31, 20, 12, 20, 12, 20, 12, 20, 12, + 7, 25, 7, 25, 7, 25, 7, 25, 26, 6, 26, 6, 26, 6, 26, 6, + 13, 19, 13, 19, 13, 19, 13, 19, 32, 0, 32, 0, 32, 0, 32, 0, + 25, 7, 25, 7, 25, 7, 25, 7, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 0 Offset 1856 + 11, 21, 11, 21, 11, 21, 11, 21, 4, 28, 4, 28, 4, 28, 4, 28, + 29, 3, 29, 3, 29, 3, 29, 3, 22, 10, 22, 10, 22, 10, 22, 10, + 15, 17, 15, 17, 15, 17, 15, 17, 8, 24, 8, 24, 8, 24, 8, 24, + 1, 31, 1, 31, 1, 31, 1, 31, 26, 6, 26, 6, 26, 6, 26, 6, + 19, 13, 19, 13, 19, 13, 19, 13, 12, 20, 12, 20, 12, 20, 12, 20, + 5, 27, 5, 27, 5, 27, 5, 27, 30, 2, 30, 2, 30, 2, 30, 2, + 23, 9, 23, 9, 23, 9, 23, 9, 16, 16, 16, 16, 16, 16, 16, 16, + 9, 23, 9, 23, 9, 23, 9, 23, 2, 30, 2, 30, 2, 30, 2, 30, + 27, 5, 27, 5, 27, 5, 27, 5, 20, 12, 20, 12, 20, 12, 20, 12, + 13, 19, 13, 19, 13, 19, 13, 19, 6, 26, 6, 26, 6, 26, 6, 26, + 31, 1, 31, 1, 31, 1, 31, 1, 24, 8, 24, 8, 24, 8, 24, 8, + 17, 15, 17, 15, 17, 15, 17, 15, 10, 22, 10, 22, 10, 22, 10, 22, + 3, 29, 3, 29, 3, 29, 3, 29, 28, 4, 28, 4, 28, 4, 28, 4, + 21, 11, 21, 11, 21, 11, 21, 11, 14, 18, 14, 18, 14, 18, 14, 18, + 7, 25, 7, 25, 7, 25, 7, 25, 32, 0, 32, 0, 32, 0, 32, 0, + 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 1 Offset 2112 + 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, + 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, + 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, + 31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, + 25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, + 19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, + 13, 19, 13, 19, 13, 19, 13, 19, 10, 22, 10, 22, 10, 22, 10, 22, + 7, 25, 7, 25, 7, 25, 7, 25, 4, 28, 4, 28, 4, 28, 4, 28, + 1, 31, 1, 31, 1, 31, 1, 31, 30, 2, 30, 2, 30, 2, 30, 2, + 27, 5, 27, 5, 27, 5, 27, 5, 24, 8, 24, 8, 24, 8, 24, 8, + 21, 11, 21, 11, 21, 11, 21, 11, 18, 14, 18, 14, 18, 14, 18, 14, + 15, 17, 15, 17, 15, 17, 15, 17, 12, 20, 12, 20, 12, 20, 12, 20, + 9, 23, 9, 23, 9, 23, 9, 23, 6, 26, 6, 26, 6, 26, 6, 26, + 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, +}; + #endif INTRA_AVX2_TABLES_H From df5f71aaa11d74e838b79fa311ad2d5832186475 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 16 Feb 2024 16:57:32 +0200 Subject: [PATCH 080/237] Clean up and add missing scripts for generating shuffle vector tables. --- src/strategies/avx2/intra-avx2.c | 1 + src/strategies/avx2/intra_avx2_tables.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8a87df30..f94274dd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2746,6 +2746,7 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: + // TODO: handle extreme wide angles separately. Most wide angles can be handled with the old code. 
if (wide_angle_mode) angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); else diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 029e6aec..26244cd4 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -103,6 +103,9 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ve }; +// NOTE: shuffle vectors for w8, w16, and w32 vertical do not exists as they are not needed. + + ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, From 9e77b3bf90c6255f739bb9b24f4b138decd1f154 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Feb 2024 02:06:00 +0200 Subject: [PATCH 081/237] Implement w4 horizontal for wide angles. --- src/strategies/avx2/intra-avx2.c | 46 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f94274dd..d3bc95a9 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1887,36 +1887,42 @@ static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uv coeff_tmp[x] = *(int16_t*)tmp; } - const __m128i vcoeff = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], - coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3]); + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], + coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], + coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { - uvg_pixel src[32]; - - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[d + 0] = ref[delta_int[x] + y + 1 + 0]; - src[d + 1] = ref[delta_int[x] + y + 1 + 1]; - src[d + 8] = ref[delta_int[x] + y + 2 + 0]; - src[d + 9] = ref[delta_int[x] + y + 2 + 1]; - src[d + 16] = ref[delta_int[x] + y + 3 + 0]; - src[d + 17] = ref[delta_int[x] + y + 3 + 1]; - src[d + 24] = ref[delta_int[x] + y + 4 + 0]; - src[d + 25] = ref[delta_int[x] + y + 4 + 1]; - } + const __m256i vidx = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); + const __m256i vsrc_raw = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx, 1); - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; + __m128i vsrc0 = _mm256_extracti128_si256(vsrc_raw, 0); + __m128i vsrc1 = _mm256_extracti128_si256(vsrc_raw, 1); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); 
res0 = _mm_srai_epi16(res0, 5); res1 = _mm_srai_epi16(res1, 5); + __m128i vfinal = _mm_packus_epi16(res0, res1); + vfinal = _mm_shuffle_epi8(vfinal, vtranspose); - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); + _mm_store_si128((__m128i*)dst, vfinal); dst += 16; } } From a9e11366b33734e5485cc7e9c9c7692fc46e59a3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Feb 2024 14:49:26 +0200 Subject: [PATCH 082/237] Implement w4 horizontal for wide angles. --- src/strategies/avx2/intra-avx2.c | 18 +++++++----------- src/strategies/avx2/intra_avx2_tables.h | 21 ++++++++++++++++++++- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d3bc95a9..d7c5804c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1479,7 +1479,7 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r const int16_t* coeff_tmp = (const int16_t*)&coeff_table[coeff_table_offset + (y << 1)]; __m256i vcoeff = _mm256_set1_epi16(*coeff_tmp); - __m128i vsrc[4]; + ALIGNED(32) __m128i vsrc[4]; vsrc[0] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 0 + 1]); vsrc[1] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 16 + 1]); // Flip these two middle sources. They will be later flipped back into place by packus vsrc[2] = _mm_loadu_si128((const __m128i*) & ref[delta_int[y] + 8 + 1]); @@ -1881,16 +1881,11 @@ static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uv const int width = 4; const __m128i v16s = _mm_set1_epi16(16); - int16_t coeff_tmp[4]; - for (int x = 0; x < width; ++x) { - int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; - coeff_tmp[x] = *(int16_t*)tmp; - } + const int mode_idx = mode < 2 ? 
mode + 12 : 80 - mode; + const int table_offset = mode_idx * 32; - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], - coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], - coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + const __m128i vcoeff0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[table_offset + 0]); + const __m128i vcoeff1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[table_offset + 16]); const __m128i vshuf = _mm_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -1902,9 +1897,10 @@ static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uv 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f ); + const __m256i vidx = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); + // Height has to be at least 4, handle 4 lines at once for (int y = 0; y < height; y += 4) { - const __m256i vidx = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); const __m256i vsrc_raw = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx, 1); __m128i vsrc0 = _mm256_extracti128_si256(vsrc_raw, 0); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 26244cd4..28ba3ee1 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -31,7 +31,7 @@ ALIGNED(32) const int8_t coeff_vector128_num_by_mode_wide_angle[14] = { }; -const int16_t coeff_table_mode_offsets[33] = { +ALIGNED(32) const int16_t coeff_table_mode_offsets[33] = { 0, 16, 272, 400, 656, 720, 848, 864, 992, 1056, 1184, 1216, 1344, 1408, 1664, 1792, 2048, 2064, 2320, 2448, 2704, 2768, 2896, 2928, 3056, 3120, 3248, 3264, 3392, 3456, 3712, 3840, 4096 }; @@ -1139,4 +1139,23 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_a 3, 29, 3, 29, 3, 29, 3, 29, 32, 0, 32, 0, 32, 0, 32, 0, }; + +// Chroma linear interpolation filter weights for width 4, horizontal wide angle modes. 
+ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 + 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -11 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 + 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -9 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -7 + 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -6 + 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -5 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 + 7, 25, 7, 25, 7, 25, 7, 25, 14, 18, 14, 18, 14, 18, 14, 18, 21, 11, 21, 11, 21, 11, 21, 11, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -3 + 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -2 + 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -1 + 25, 7, 25, 7, 25, 7, 25, 7, 18, 14, 18, 14, 18, 14, 18, 14, 11, 21, 11, 21, 11, 21, 11, 21, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 0 + 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 1 +}; + #endif INTRA_AVX2_TABLES_H From c2683f65348811c1b6cccdb0d3ffe9a4373783b1 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Feb 2024 15:29:51 +0200 Subject: [PATCH 083/237] Remove the w4 vertical wide angle placeholders. --- src/strategies/avx2/intra-avx2.c | 110 ++----------------------------- 1 file changed, 6 insertions(+), 104 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d7c5804c..3968bf41 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1620,10 +1620,9 @@ static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; - const int16_t* dfract = delta_fract; const __m128i v16s = _mm_set1_epi16(16); const int16_t weigth_offset = (mode - 2) * 64; const int16_t shuf_offset = (mode - 2) * 64; @@ -1672,98 +1671,6 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r } -// Can handle most of the wide angle modes. 
-static void angular_pred_avx2_linear_filter_w4_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) -{ - const int width = 4; - const int16_t* dint = delta_int; - const __m128i v16s = _mm_set1_epi16(16); - // Height has to be at least 4, handle 4 lines at once - for (int y = 0; y < height; y += 4) { - uvg_pixel src[32]; - int16_t coeff_tmp[4]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int yy = 0; yy < 4; ++yy) { - src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; - src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; - int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; - coeff_tmp[yy] = *(int16_t*)tmp; - } - dint += 4; - - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], - coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], - coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); - - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; - - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); - - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; - } -} - - -// Handles the extreme wide angle modes, as these need a different memory load pattern. 
-static void angular_pred_avx2_linear_filter_w4_ver_extreme_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) -{ - const int width = 4; - const int16_t* dint = delta_int; - const __m128i v16s = _mm_set1_epi16(16); - // Height has to be at least 4, handle 4 lines at once - for (int y = 0; y < height; y += 4) { - uvg_pixel src[32]; - int16_t coeff_tmp[4]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int yy = 0; yy < 4; ++yy) { - src[yy * 8 + 0] = ref[dint[yy] + 1 + 0]; - src[yy * 8 + 1] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 2] = ref[dint[yy] + 1 + 1]; - src[yy * 8 + 3] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 4] = ref[dint[yy] + 1 + 2]; - src[yy * 8 + 5] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 6] = ref[dint[yy] + 1 + 3]; - src[yy * 8 + 7] = ref[dint[yy] + 1 + 4]; - int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; - coeff_tmp[yy] = *(int16_t*)tmp; - } - dint += 4; - - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], - coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], - coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); - - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; - - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); - - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; - } -} - - static void angular_pred_avx2_linear_filter_w8_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; @@ -1876,7 +1783,7 @@ static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, u } -static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int width = 4; const __m128i v16s = _mm_set1_epi16(16); @@ -2747,13 +2654,8 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { - case 4: - // TODO: handle extreme wide angles separately. Most wide angles can be handled with the old code. - if (wide_angle_mode) - angular_pred_avx2_linear_filter_w4_ver_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); - else - angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); - break; + // No wide angle handling for w4 is needed. 
+ case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); break; case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; @@ -2765,7 +2667,7 @@ static void uvg_angular_pred_avx2( else { if (wide_angle_mode) { switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 16: angular_pred_avx2_linear_filter_w16_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 32: assert(false && "This code branch only works with UVG_FORMAT_P420."); break; // This branch is never executed with UVG_FORMAT_P420, due to chroma being only 32 width or height. @@ -2779,7 +2681,7 @@ static void uvg_angular_pred_avx2( case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, pred_mode, delta_int); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; From 802f01fda5997039ec293594fda0ac3e390bed6a Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Feb 2024 17:56:06 +0200 Subject: [PATCH 084/237] Implement w8 horizontal for wide angles. --- src/strategies/avx2/intra-avx2.c | 70 +++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3968bf41..e79d0d24 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1785,7 +1785,6 @@ static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, u static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { - const int width = 4; const __m128i v16s = _mm_set1_epi16(16); const int mode_idx = mode < 2 ? 
mode + 12 : 80 - mode; @@ -1842,11 +1841,31 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv coeff_tmp[x] = *(int16_t*)tmp; } - const __m128i vcoeff = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], - coeff_tmp[4], coeff_tmp[5], coeff_tmp[6], coeff_tmp[7]); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); - // Height has to be at least 2, handle 2 lines at once - for (int y = 0; y < height; y += 2) { + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + const __m256i vidx0 = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); + const __m256i vidx1 = _mm256_setr_epi64x(delta_int[4], delta_int[5], delta_int[6], delta_int[7]); + + const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], + coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); + const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], + coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); + const __m128i vcoeff2 = _mm_setr_epi16(coeff_tmp[4], coeff_tmp[4], coeff_tmp[4], coeff_tmp[4], + coeff_tmp[5], coeff_tmp[5], coeff_tmp[5], coeff_tmp[5]); + const __m128i vcoeff3 = _mm_setr_epi16(coeff_tmp[6], coeff_tmp[6], coeff_tmp[6], coeff_tmp[6], + coeff_tmp[7], coeff_tmp[7], coeff_tmp[7], coeff_tmp[7]); + + // Height has to be at least 2. Handle as 4x4 blocks. Special handling needed when height == 2. + // TODO: make sure this function is not called when height is 2. + for (int y = 0; y < height; y += 4) { uvg_pixel src[32]; // TODO: get rid of this slow crap, this is just here to test the calculations for (int x = 0, d = 0; x < width; ++x, d += 2) { @@ -1856,18 +1875,47 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv src[d + 17] = ref[delta_int[x] + y + 2 + 1]; } - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; + const __m256i vsrc_raw0 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx0, 1); + const __m256i vsrc_raw1 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx1, 1); - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); + __m128i vsrc0 = _mm256_extracti128_si256(vsrc_raw0, 0); + __m128i vsrc1 = _mm256_extracti128_si256(vsrc_raw0, 1); + __m128i vsrc2 = _mm256_extracti128_si256(vsrc_raw1, 0); + __m128i vsrc3 = _mm256_extracti128_si256(vsrc_raw1, 1); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); + vsrc2 = _mm_shuffle_epi8(vsrc2, vshuf); + vsrc3 = _mm_shuffle_epi8(vsrc3, vshuf); + + const __m128i* kek0 = (const __m128i*) & src[0]; + const __m128i* kek1 = (const __m128i*) & src[16]; + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff2); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff3); res0 = _mm_add_epi16(res0, v16s); res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); res0 = _mm_srai_epi16(res0, 5); res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; + __m128i vtmp0 = _mm_packus_epi16(res0, res1); + 
__m128i vtmp1 = _mm_packus_epi16(res2, res3); + vtmp0 = _mm_shuffle_epi8(vtmp0, vtranspose); + vtmp1 = _mm_shuffle_epi8(vtmp1, vtranspose); + + __m128i vfinal0 = _mm_unpacklo_epi32(vtmp0, vtmp1); + __m128i vfinal1 = _mm_unpackhi_epi32(vtmp0, vtmp1); + + + _mm_store_si128((__m128i*)&dst[0], vfinal0); + _mm_store_si128((__m128i*)&dst[16], vfinal1); + dst += 32; } } From 5dd7ca1a84b8498f549cb952e7437183c20ae0b6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 21 Feb 2024 19:24:59 +0200 Subject: [PATCH 085/237] Implement w16 horizontal for wide angles. --- src/strategies/avx2/intra-avx2.c | 100 ++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e79d0d24..1120dde5 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1888,9 +1888,6 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv vsrc2 = _mm_shuffle_epi8(vsrc2, vshuf); vsrc3 = _mm_shuffle_epi8(vsrc3, vshuf); - const __m128i* kek0 = (const __m128i*) & src[0]; - const __m128i* kek1 = (const __m128i*) & src[16]; - __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff0); __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff1); __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff2); @@ -1925,38 +1922,87 @@ static void angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, u const int width = 16; const __m128i v16s = _mm_set1_epi16(16); + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + ); + + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + int16_t coeff_tmp[16]; for (int x = 0; x < width; ++x) { int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; coeff_tmp[x] = *(int16_t*)tmp; } - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[1], coeff_tmp[2], coeff_tmp[3], - coeff_tmp[4], coeff_tmp[5], coeff_tmp[6], coeff_tmp[7]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[ 8], coeff_tmp[ 9], coeff_tmp[10], coeff_tmp[11], - coeff_tmp[12], coeff_tmp[13], coeff_tmp[14], coeff_tmp[15]); - - // Height has to be at least 2, handle 1 line at a time - for (int y = 0; y < height; ++y) { - uvg_pixel src[32]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[d + 0] = ref[delta_int[x] + y + 1 + 0]; - src[d + 1] = ref[delta_int[x] + y + 1 + 1]; - } - - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; + __m256i vidx[4]; + for (int i = 0, d = 0; i < 4; ++i, d += 4) { + vidx[i] = _mm256_setr_epi64x(delta_int[d + 0], delta_int[d + 1], delta_int[d + 2], delta_int[d + 3]); + } - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); + __m128i vcoeff[8]; + for (int i = 0, c = 0; i < 8; ++i, c += 2) { + vcoeff[i] = _mm_setr_epi16(coeff_tmp[c + 0], coeff_tmp[c + 0], coeff_tmp[c + 0], coeff_tmp[c + 0], + coeff_tmp[c + 1], coeff_tmp[c + 1], coeff_tmp[c + 1], coeff_tmp[c + 1]); + } - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; + // Height has to be at least 2. Handle as 4x4 blocks. Special handling needed when height < 4. 
+ // TODO: make sure this function is not called when height is less than 4. + for (int y = 0; y < height; y += 4) { + __m128i vtmp[4]; + for (int x = 0, v = 0, c = 0; x < width; x += 8, v += 2, c += 4) { + const __m256i vsrc_raw0 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx[v + 0], 1); + const __m256i vsrc_raw1 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx[v + 1], 1); + + __m128i vsrc0 = _mm256_extracti128_si256(vsrc_raw0, 0); + __m128i vsrc1 = _mm256_extracti128_si256(vsrc_raw0, 1); + __m128i vsrc2 = _mm256_extracti128_si256(vsrc_raw1, 0); + __m128i vsrc3 = _mm256_extracti128_si256(vsrc_raw1, 1); + + vsrc0 = _mm_shuffle_epi8(vsrc0, vshuf); + vsrc1 = _mm_shuffle_epi8(vsrc1, vshuf); + vsrc2 = _mm_shuffle_epi8(vsrc2, vshuf); + vsrc3 = _mm_shuffle_epi8(vsrc3, vshuf); + + __m128i res0 = _mm_maddubs_epi16(vsrc0, vcoeff[c + 0]); + __m128i res1 = _mm_maddubs_epi16(vsrc1, vcoeff[c + 1]); + __m128i res2 = _mm_maddubs_epi16(vsrc2, vcoeff[c + 2]); + __m128i res3 = _mm_maddubs_epi16(vsrc3, vcoeff[c + 3]); + res0 = _mm_add_epi16(res0, v16s); + res1 = _mm_add_epi16(res1, v16s); + res2 = _mm_add_epi16(res2, v16s); + res3 = _mm_add_epi16(res3, v16s); + res0 = _mm_srai_epi16(res0, 5); + res1 = _mm_srai_epi16(res1, 5); + res2 = _mm_srai_epi16(res2, 5); + res3 = _mm_srai_epi16(res3, 5); + + vtmp[v + 0] = _mm_packus_epi16(res0, res1); + vtmp[v + 1] = _mm_packus_epi16(res2, res3); + } + vtmp[0] = _mm_shuffle_epi8(vtmp[0], vtranspose); + vtmp[1] = _mm_shuffle_epi8(vtmp[1], vtranspose); + vtmp[2] = _mm_shuffle_epi8(vtmp[2], vtranspose); + vtmp[3] = _mm_shuffle_epi8(vtmp[3], vtranspose); + + __m128i vupk32_lo0 = _mm_unpacklo_epi32(vtmp[0], vtmp[1]); + __m128i vupk32_hi0 = _mm_unpackhi_epi32(vtmp[0], vtmp[1]); + __m128i vupk32_lo1 = _mm_unpacklo_epi32(vtmp[2], vtmp[3]); + __m128i vupk32_hi1 = _mm_unpackhi_epi32(vtmp[2], vtmp[3]); + + __m128i vfinal0 = _mm_unpacklo_epi64(vupk32_lo0, vupk32_lo1); + __m128i vfinal1 = _mm_unpackhi_epi64(vupk32_lo0, vupk32_lo1); + __m128i vfinal2 = _mm_unpacklo_epi64(vupk32_hi0, vupk32_hi1); + __m128i vfinal3 = _mm_unpackhi_epi64(vupk32_hi0, vupk32_hi1); + + _mm_store_si128((__m128i*) & dst[0], vfinal0); + _mm_store_si128((__m128i*) & dst[16], vfinal1); + _mm_store_si128((__m128i*) & dst[32], vfinal2); + _mm_store_si128((__m128i*) & dst[48], vfinal3); + dst += 64; } } From a55e2931b92bf4544bf0faeb71bb2d95e37bcf9c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Feb 2024 15:54:45 +0200 Subject: [PATCH 086/237] Load weights from table in w8 and w16. Replace w4 weight table with w16 table. All horizontal wide angle functions can use this same table. --- src/strategies/avx2/intra-avx2.c | 41 +++---- src/strategies/avx2/intra_avx2_tables.h | 148 +++++++++++++++++++++--- 2 files changed, 148 insertions(+), 41 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1120dde5..930e7586 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1788,10 +1788,10 @@ static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uv const __m128i v16s = _mm_set1_epi16(16); const int mode_idx = mode < 2 ? 
mode + 12 : 80 - mode; - const int table_offset = mode_idx * 32; + const int table_offset = mode_idx * 128; - const __m128i vcoeff0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[table_offset + 0]); - const __m128i vcoeff1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[table_offset + 16]); + const __m128i vcoeff0 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 0]); + const __m128i vcoeff1 = _mm_load_si128((const __m128i*) &intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 16]); const __m128i vshuf = _mm_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -1835,12 +1835,6 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv const int width = 8; const __m128i v16s = _mm_set1_epi16(16); - int16_t coeff_tmp[8]; - for (int x = 0; x < width; ++x) { - int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; - coeff_tmp[x] = *(int16_t*)tmp; - } - const __m128i vshuf = _mm_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c @@ -1854,14 +1848,13 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv const __m256i vidx0 = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); const __m256i vidx1 = _mm256_setr_epi64x(delta_int[4], delta_int[5], delta_int[6], delta_int[7]); - const __m128i vcoeff0 = _mm_setr_epi16(coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], coeff_tmp[0], - coeff_tmp[1], coeff_tmp[1], coeff_tmp[1], coeff_tmp[1]); - const __m128i vcoeff1 = _mm_setr_epi16(coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], coeff_tmp[2], - coeff_tmp[3], coeff_tmp[3], coeff_tmp[3], coeff_tmp[3]); - const __m128i vcoeff2 = _mm_setr_epi16(coeff_tmp[4], coeff_tmp[4], coeff_tmp[4], coeff_tmp[4], - coeff_tmp[5], coeff_tmp[5], coeff_tmp[5], coeff_tmp[5]); - const __m128i vcoeff3 = _mm_setr_epi16(coeff_tmp[6], coeff_tmp[6], coeff_tmp[6], coeff_tmp[6], - coeff_tmp[7], coeff_tmp[7], coeff_tmp[7], coeff_tmp[7]); + const int mode_idx = mode < 2 ? mode + 12 : 80 - mode; + const int table_offset = mode_idx * 128; + + const __m128i vcoeff0 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 0]); + const __m128i vcoeff1 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 16]); + const __m128i vcoeff2 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 32]); + const __m128i vcoeff3 = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + 48]); // Height has to be at least 2. Handle as 4x4 blocks. Special handling needed when height == 2. // TODO: make sure this function is not called when height is 2. @@ -1932,21 +1925,17 @@ static void angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, u 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f ); - int16_t coeff_tmp[16]; - for (int x = 0; x < width; ++x) { - int8_t tmp[2] = { 32 - delta_fract[x], delta_fract[x] }; - coeff_tmp[x] = *(int16_t*)tmp; - } - __m256i vidx[4]; for (int i = 0, d = 0; i < 4; ++i, d += 4) { vidx[i] = _mm256_setr_epi64x(delta_int[d + 0], delta_int[d + 1], delta_int[d + 2], delta_int[d + 3]); } + const int mode_idx = mode < 2 ? 
mode + 12 : 80 - mode; + const int table_offset = mode_idx * 128; + __m128i vcoeff[8]; - for (int i = 0, c = 0; i < 8; ++i, c += 2) { - vcoeff[i] = _mm_setr_epi16(coeff_tmp[c + 0], coeff_tmp[c + 0], coeff_tmp[c + 0], coeff_tmp[c + 0], - coeff_tmp[c + 1], coeff_tmp[c + 1], coeff_tmp[c + 1], coeff_tmp[c + 1]); + for (int i = 0, o = 0; i < 8; ++i, o += 16) { + vcoeff[i] = _mm_load_si128((const __m128i*) & intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[table_offset + o]); } // Height has to be at least 2. Handle as 4x4 blocks. Special handling needed when height < 4. diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 28ba3ee1..5084076e 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1141,21 +1141,139 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_a // Chroma linear interpolation filter weights for width 4, horizontal wide angle modes. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[] = { - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 - 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -11 - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 - 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -9 - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 - 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -7 - 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -6 - 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -5 - 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 - 7, 25, 7, 25, 7, 25, 7, 25, 14, 18, 14, 18, 14, 18, 14, 18, 21, 11, 21, 11, 21, 11, 21, 11, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -3 - 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -2 - 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -1 - 25, 7, 25, 7, 25, 7, 25, 7, 18, 14, 18, 14, 18, 14, 18, 14, 11, 21, 11, 21, 11, 21, 11, 21, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 0 - 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 1 +//ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor_wide_angle[] = { +// 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 +// 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -11 +// 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 +// 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -9 +// 
32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 +// 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -7 +// 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, // Mode -6 +// 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -5 +// 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 +// 7, 25, 7, 25, 7, 25, 7, 25, 14, 18, 14, 18, 14, 18, 14, 18, 21, 11, 21, 11, 21, 11, 21, 11, 28, 4, 28, 4, 28, 4, 28, 4, // Mode -3 +// 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -2 +// 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, // Mode -1 +// 25, 7, 25, 7, 25, 7, 25, 7, 18, 14, 18, 14, 18, 14, 18, 14, 11, 21, 11, 21, 11, 21, 11, 21, 4, 28, 4, 28, 4, 28, 4, 28, // Mode 0 +// 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, // Mode 1 +//}; + + +// NOTE: this table can also be used by horizontal w4 and w8 wide angle functions since their tables are just a subset of this one. +// Chroma linear interpolation filter weights for width 4, horizontal wide angle modes. +ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[] = { + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, // Mode -11 + 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, + 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, + 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, + 3, 29, 3, 29, 3, 29, 3, 29, 14, 18, 14, 18, 14, 18, 14, 18, + 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, + 15, 17, 15, 17, 15, 17, 15, 17, 26, 6, 26, 6, 26, 6, 26, 6, + 5, 27, 5, 27, 5, 27, 5, 27, 16, 16, 16, 16, 16, 16, 16, 16, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, // Mode -9 + 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, + 9, 23, 9, 23, 9, 23, 9, 23, 30, 2, 30, 2, 30, 2, 30, 2, + 19, 13, 19, 13, 19, 13, 19, 13, 8, 24, 8, 24, 8, 24, 8, 24, + 29, 3, 29, 3, 29, 3, 29, 3, 18, 14, 18, 14, 18, 14, 18, 14, + 7, 25, 7, 25, 7, 25, 7, 25, 28, 4, 28, 4, 28, 4, 28, 4, + 17, 15, 17, 15, 17, 15, 17, 15, 6, 26, 6, 26, 6, 26, 6, 26, + 27, 5, 27, 
5, 27, 5, 27, 5, 16, 16, 16, 16, 16, 16, 16, 16, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -8 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -7 + 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24, + 2, 30, 2, 30, 2, 30, 2, 30, 28, 4, 28, 4, 28, 4, 28, 4, + 22, 10, 22, 10, 22, 10, 22, 10, 16, 16, 16, 16, 16, 16, 16, 16, + 10, 22, 10, 22, 10, 22, 10, 22, 4, 28, 4, 28, 4, 28, 4, 28, + 30, 2, 30, 2, 30, 2, 30, 2, 24, 8, 24, 8, 24, 8, 24, 8, + 18, 14, 18, 14, 18, 14, 18, 14, 12, 20, 12, 20, 12, 20, 12, 20, + 6, 26, 6, 26, 6, 26, 6, 26, 32, 0, 32, 0, 32, 0, 32, 0, + 10, 22, 10, 22, 10, 22, 10, 22, 20, 12, 20, 12, 20, 12, 20, 12, // Mode -6 + 30, 2, 30, 2, 30, 2, 30, 2, 8, 24, 8, 24, 8, 24, 8, 24, + 18, 14, 18, 14, 18, 14, 18, 14, 28, 4, 28, 4, 28, 4, 28, 4, + 6, 26, 6, 26, 6, 26, 6, 26, 16, 16, 16, 16, 16, 16, 16, 16, + 26, 6, 26, 6, 26, 6, 26, 6, 4, 28, 4, 28, 4, 28, 4, 28, + 14, 18, 14, 18, 14, 18, 14, 18, 24, 8, 24, 8, 24, 8, 24, 8, + 2, 30, 2, 30, 2, 30, 2, 30, 12, 20, 12, 20, 12, 20, 12, 20, + 22, 10, 22, 10, 22, 10, 22, 10, 32, 0, 32, 0, 32, 0, 32, 0, + 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, // Mode -5 + 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, + 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, + 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, + 15, 17, 15, 17, 15, 17, 15, 17, 6, 26, 6, 26, 6, 26, 6, 26, + 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, + 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, + 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -4 + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, + 7, 25, 7, 25, 7, 25, 7, 25, 14, 18, 14, 18, 14, 18, 14, 18, // Mode -3 + 21, 11, 21, 11, 21, 11, 21, 11, 28, 4, 28, 4, 28, 4, 28, 4, + 3, 29, 3, 29, 3, 29, 3, 29, 10, 22, 10, 22, 10, 22, 10, 22, + 17, 15, 17, 15, 17, 15, 17, 15, 24, 8, 24, 8, 24, 8, 24, 8, + 31, 1, 31, 1, 31, 1, 31, 1, 6, 26, 6, 26, 6, 26, 6, 26, + 13, 19, 13, 19, 13, 19, 13, 19, 20, 12, 20, 12, 20, 12, 20, 12, + 27, 5, 27, 5, 27, 5, 27, 5, 2, 30, 2, 30, 2, 30, 2, 30, + 9, 23, 9, 23, 9, 23, 9, 23, 16, 16, 16, 16, 16, 16, 16, 16, + 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, // Mode -2 + 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, + 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, + 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, + 21, 11, 21, 11, 21, 11, 21, 11, 2, 30, 2, 30, 2, 30, 2, 30, + 15, 17, 15, 17, 15, 17, 15, 17, 28, 4, 28, 4, 28, 4, 28, 4, + 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, + 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, + 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 
6, 26, 6, 26, 6, 26, // Mode -1 + 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, + 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, + 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, + 11, 21, 11, 21, 11, 21, 11, 21, 30, 2, 30, 2, 30, 2, 30, 2, + 17, 15, 17, 15, 17, 15, 17, 15, 4, 28, 4, 28, 4, 28, 4, 28, + 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, + 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, + 25, 7, 25, 7, 25, 7, 25, 7, 18, 14, 18, 14, 18, 14, 18, 14, // Mode 0 + 11, 21, 11, 21, 11, 21, 11, 21, 4, 28, 4, 28, 4, 28, 4, 28, + 29, 3, 29, 3, 29, 3, 29, 3, 22, 10, 22, 10, 22, 10, 22, 10, + 15, 17, 15, 17, 15, 17, 15, 17, 8, 24, 8, 24, 8, 24, 8, 24, + 1, 31, 1, 31, 1, 31, 1, 31, 26, 6, 26, 6, 26, 6, 26, 6, + 19, 13, 19, 13, 19, 13, 19, 13, 12, 20, 12, 20, 12, 20, 12, 20, + 5, 27, 5, 27, 5, 27, 5, 27, 30, 2, 30, 2, 30, 2, 30, 2, + 23, 9, 23, 9, 23, 9, 23, 9, 16, 16, 16, 16, 16, 16, 16, 16, + 29, 3, 29, 3, 29, 3, 29, 3, 26, 6, 26, 6, 26, 6, 26, 6, // Mode 1 + 23, 9, 23, 9, 23, 9, 23, 9, 20, 12, 20, 12, 20, 12, 20, 12, + 17, 15, 17, 15, 17, 15, 17, 15, 14, 18, 14, 18, 14, 18, 14, 18, + 11, 21, 11, 21, 11, 21, 11, 21, 8, 24, 8, 24, 8, 24, 8, 24, + 5, 27, 5, 27, 5, 27, 5, 27, 2, 30, 2, 30, 2, 30, 2, 30, + 31, 1, 31, 1, 31, 1, 31, 1, 28, 4, 28, 4, 28, 4, 28, 4, + 25, 7, 25, 7, 25, 7, 25, 7, 22, 10, 22, 10, 22, 10, 22, 10, + 19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, }; #endif INTRA_AVX2_TABLES_H From abb6552c72370973e694d778146d4eac728a9798 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 23 Feb 2024 03:17:52 +0200 Subject: [PATCH 087/237] Clean up w8. --- src/strategies/avx2/intra-avx2.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 930e7586..0c3a5144 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1859,15 +1859,6 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv // Height has to be at least 2. Handle as 4x4 blocks. Special handling needed when height == 2. // TODO: make sure this function is not called when height is 2. for (int y = 0; y < height; y += 4) { - uvg_pixel src[32]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[d + 0] = ref[delta_int[x] + y + 1 + 0]; - src[d + 1] = ref[delta_int[x] + y + 1 + 1]; - src[d + 16] = ref[delta_int[x] + y + 2 + 0]; - src[d + 17] = ref[delta_int[x] + y + 2 + 1]; - } - const __m256i vsrc_raw0 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx0, 1); const __m256i vsrc_raw1 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx1, 1); From bb23a9da87399a33fb0f6600cbd543826d98bfd5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 23 Feb 2024 14:37:00 +0200 Subject: [PATCH 088/237] Replace setr with load and cvtepi. 
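Note for readers of the hunk below: the eight 16-bit delta_int entries fit in a single
aligned 128-bit load, so the four 64-bit gather indices can be produced with a
sign-extending widen (vpmovsxwq) instead of being assembled lane by lane with
_mm256_setr_epi64x. A minimal sketch of the pattern, assuming delta_int is 16-byte
aligned as the aligned load in the hunk implies (variable names here are illustrative
only):

    __m128i raw   = _mm_load_si128((const __m128i*)delta_int);        // delta_int[0..7]
    __m256i vidx0 = _mm256_cvtepi16_epi64(raw);                        // indices 0..3
    // Move entries 4..7 to the low lanes; the 0xff bytes are don't-care filler
    // because cvtepi16_epi64 only reads the low four 16-bit lanes.
    const __m128i hi = _mm_setr_epi8(0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
                                     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);
    __m256i vidx1 = _mm256_cvtepi16_epi64(_mm_shuffle_epi8(raw, hi));  // indices 4..7

This trades eight scalar inserts for one load, one shuffle and two widening conversions.
The sign-extending form matches what the old _mm256_setr_epi64x call did implicitly when
the int16_t values were promoted.
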
--- src/strategies/avx2/intra-avx2.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0c3a5144..4f643a8e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1845,8 +1845,13 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f ); - const __m256i vidx0 = _mm256_setr_epi64x(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); - const __m256i vidx1 = _mm256_setr_epi64x(delta_int[4], delta_int[5], delta_int[6], delta_int[7]); + const __m128i vidxshuf = _mm_setr_epi8(0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff); // Don't care + __m128i vidx_raw = _mm_load_si128((__m128i*)delta_int); + + const __m256i vidx0 = _mm256_cvtepi16_epi64(vidx_raw); + vidx_raw = _mm_shuffle_epi8(vidx_raw, vidxshuf); + const __m256i vidx1 = _mm256_cvtepi16_epi64(vidx_raw); const int mode_idx = mode < 2 ? mode + 12 : 80 - mode; const int table_offset = mode_idx * 128; @@ -1916,10 +1921,20 @@ static void angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, u 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f ); + const __m128i vidxshuf = _mm_setr_epi8(0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff); // Don't care + + __m128i vidx_raw0 = _mm_load_si128((__m128i*) & delta_int[0]); + __m128i vidx_raw1 = _mm_load_si128((__m128i*) & delta_int[8]); + __m256i vidx[4]; - for (int i = 0, d = 0; i < 4; ++i, d += 4) { - vidx[i] = _mm256_setr_epi64x(delta_int[d + 0], delta_int[d + 1], delta_int[d + 2], delta_int[d + 3]); - } + vidx[0] = _mm256_cvtepi16_epi64(vidx_raw0); + vidx_raw0 = _mm_shuffle_epi8(vidx_raw0, vidxshuf); + vidx[1] = _mm256_cvtepi16_epi64(vidx_raw0); + + vidx[2] = _mm256_cvtepi16_epi64(vidx_raw1); + vidx_raw1 = _mm_shuffle_epi8(vidx_raw1, vidxshuf); + vidx[3] = _mm256_cvtepi16_epi64(vidx_raw1); const int mode_idx = mode < 2 ? mode + 12 : 80 - mode; const int table_offset = mode_idx * 128; From ea2b607151bfeafbf997d4218c1c80afdf3a1214 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 28 Feb 2024 02:56:29 +0200 Subject: [PATCH 089/237] Finish alternate version of horizontal pdpc w4. This uses pre-calculated weights loaded from tables instead of calculating weights during runtime. --- src/strategies/avx2/intra-avx2.c | 21 ++++++++------------- src/strategies/avx2/intra_avx2_tables.h | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4f643a8e..3e994db3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2458,29 +2458,24 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); __m128i vidx = _mm_slli_epi32(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); - __m256i vwT_shuffle = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7); - for (int y = 0; y < limit; y += 4) { + + // Scale can be 0, 1 or 2 + const int table_offset = scale * 64; + + for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; - - // Set weight to zero if limit reached. 
- // This removes the need to blend results with unmodified values in the end. - wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; for (int x = 0; x < 4; ++x) { ref_top[yy][x] = ref_side[(x) + (inv_angle_sum >> 9) + 1]; } } + const int offset = table_offset + o; __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); - uint64_t quad; - memcpy(&quad, wT, sizeof(quad)); - __m256i vwT = _mm256_set1_epi64x(quad); - vwT = _mm256_shuffle_epi8(vwT, vwT_shuffle); + __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_hor_weight[offset]); + __m256i accu = _mm256_sub_epi16(vtop, vpred16); accu = _mm256_mullo_epi16(vwT, accu); accu = _mm256_add_epi16(accu, v32s); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 5084076e..d73f3d64 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1276,4 +1276,21 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor_wide_ 19, 13, 19, 13, 19, 13, 19, 13, 16, 16, 16, 16, 16, 16, 16, 16, }; + +// Weights for intra pdpc w4 horizontal. +ALIGNED(32) const int16_t intra_pdpc_w4_hor_weight[] = { + 32, 32, 32, 32, 8, 8, 8, 8, 2, 2, 2, 2, 0, 0, 0, 0, // Scale 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 32, 32, 32, 16, 16, 16, 16, 8, 8, 8, 8, 4, 4, 4, 4, // Scale 1 + 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, // Scale 2 + 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + #endif INTRA_AVX2_TABLES_H From a80e113a39e58ae79b9603090cbe4a1c2511f7ea Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 28 Feb 2024 13:45:43 +0200 Subject: [PATCH 090/237] Change log2 width to a magic number instead of fetching from table. Added comment for clarity. --- src/strategies/avx2/intra-avx2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3e994db3..9fbc4ce6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2453,10 +2453,9 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, int16_t ref_top[4][4]; int limit = MIN(3 << scale, height); - const int log2_width = uvg_g_convert_to_log2[width]; __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width __m256i v32s = _mm256_set1_epi16(32); // Scale can be 0, 1 or 2 From d5eafd3d58ac174a5549706dbfcc7eae379d5051 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 28 Feb 2024 15:46:07 +0200 Subject: [PATCH 091/237] Implement alternate version of w8 horizontal. Similar to the w4. 
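For reference: the intra_pdpc_w8_hor_weight values added below appear to follow the same
formula the removed scalar path used, 32 >> ((2 * y) >> scale), with zeros once y reaches
3 << scale. The weight of each line is broadcast over the 8-wide row and two consecutive
lines share one 256-bit load, hence the table offset scale * 128 and the step of 16
entries per two lines. A throwaway generator along these lines reproduces the block for
each scale; this is an assumption about how the constants were derived, not part of the
patch:

    #include <stdio.h>

    int main(void) {
      for (int scale = 0; scale < 3; ++scale) {           // one 128-entry block per scale
        const int limit = 3 << scale;                      // same cut-off the kernel uses
        printf("// Scale %d\n", scale);
        for (int y = 0; y < 16; ++y) {                     // up to 16 lines are covered
          const int w = (y < limit) ? 32 >> ((2 * y) >> scale) : 0;
          for (int x = 0; x < 8; ++x) printf("%d, ", w);   // broadcast over the 8-wide row
          if (y & 1) printf("\n");                         // two lines per printed row
        }
      }
      return 0;
    }
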
--- src/strategies/avx2/intra-avx2.c | 22 +++++++------------ src/strategies/avx2/intra_avx2_tables.h | 29 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9fbc4ce6..2223516c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2463,9 +2463,9 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { - int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; + int inv_angle_sum = (256 + (y + yy + 1) * inv_sample_disp) >> 9; for (int x = 0; x < 4; ++x) { - ref_top[yy][x] = ref_side[(x) + (inv_angle_sum >> 9) + 1]; + ref_top[yy][x] = ref_side[x + inv_angle_sum + 1]; } } const int offset = table_offset + o; @@ -2494,21 +2494,16 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width = 8; int limit = MIN(3 << scale, height); - const int log2_width = uvg_g_convert_to_log2[width]; __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, log2_width); + __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width __m256i v32s = _mm256_set1_epi16(32); - for (int y = 0; y < limit; y += 2) { - // Set weight to zero if limit reached. - // This removes the need to blend results with unmodified values in the end. - const int16_t wT0 = 32 >> (2 * (y + 0) >> scale); // This cannot reach limit, so do not check - const int16_t wT1 = y + 1 < limit ? 32 >> (2 * (y + 1) >> scale) : 0; + // Scale can be 0, 1 or 2 + const int table_offset = scale * 128; - __m128i vwT[2]; - vwT[0] = _mm_set1_epi16(wT0); - vwT[1] = _mm_set1_epi16(wT1); + for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { + const __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_hor_weight[o]); ALIGNED(32) uvg_pixel tmp[16]; int shifted_inv_angle_sum = (256 + (y + 0 + 1) * inv_sample_disp) >> 9; @@ -2523,7 +2518,7 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); __m256i accu = _mm256_sub_epi16(vtop16, vpred16); - accu = _mm256_mullo_epi16(*(__m256i*)vwT, accu); + accu = _mm256_mullo_epi16(vwT, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); accu = _mm256_add_epi16(vpred16, accu); @@ -2539,7 +2534,6 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { int limit = MIN(3 << scale, height); - const int log2_width = uvg_g_convert_to_log2[width]; __m256i v32s = _mm256_set1_epi16(32); // Handle one line at a time. Skip line if vertical limit reached. diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index d73f3d64..50b4a9ee 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1293,4 +1293,33 @@ ALIGNED(32) const int16_t intra_pdpc_w4_hor_weight[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + +// Weights for intra pdpc w8 horizontal. 
+ALIGNED(32) const int16_t intra_pdpc_w8_hor_weight[] = { + 32, 32, 32, 32, 32, 32, 32, 32, 8, 8, 8, 8, 8, 8, 8, 8, // Scale 0 + 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16, 16, 16, // Scale 1 + 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, // Scale 2 + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + #endif INTRA_AVX2_TABLES_H From 8b5b3407363df2ebcee0e91ceef92990fe9dca0d Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 1 Mar 2024 11:53:41 +0200 Subject: [PATCH 092/237] Replace x-loop with memcpy. --- src/strategies/avx2/intra-avx2.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 2223516c..f82424a3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2407,9 +2407,7 @@ static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Set weight to zero if limit reached. // This removes the need to blend results with unmodified values in the end. wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; - for (int xx = 0; xx < 4; ++xx) { - ref_top[yy][xx] = ref_side[(x + xx) + (inv_angle_sum >> 9) + 1]; - } + memcpy(ref_top[yy], &ref_side[(x) + (inv_angle_sum >> 9) + 1], 4 * sizeof(int16_t)); } __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); @@ -2464,9 +2462,7 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { int inv_angle_sum = (256 + (y + yy + 1) * inv_sample_disp) >> 9; - for (int x = 0; x < 4; ++x) { - ref_top[yy][x] = ref_side[x + inv_angle_sum + 1]; - } + memcpy(ref_top[yy], &ref_side[(inv_angle_sum >> 9) + 1], 4 * sizeof(int16_t)); } const int offset = table_offset + o; From e091031f75847aedfbe144e1f766d696bb4ad71a Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 1 Mar 2024 15:50:08 +0200 Subject: [PATCH 093/237] Improve intra pdpc w4 vertical. Load weights from table instead of calculating during runtime. 
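Side note on the new table: the intra_pdpc_w4_ver_weight rows added below look like the
old per-column weight, wL[x] = (x < 3 << scale) ? 32 >> ((2 * x) >> scale) : 0, repeated
for the four lines that one loop iteration writes, which is why a single 256-bit load at
offset scale * 16 can replace the runtime computation. A small sketch of that
correspondence (a standalone check, not code from the patch; scale is picked arbitrarily):

    #include <stdio.h>

    int main(void) {
      const int scale = 1;                      /* 0, 1 or 2 */
      for (int i = 0; i < 16; ++i) {
        const int x = i & 3;                    /* column inside the 4-wide row */
        printf("%d, ", (x < 3 << scale) ? 32 >> ((2 * x) >> scale) : 0);
      }
      printf("\n");                             /* -> 32, 16, 8, 4 repeated four times */
      return 0;
    }
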
--- src/strategies/avx2/intra-avx2.c | 20 ++++++++++---------- src/strategies/avx2/intra_avx2_tables.h | 8 ++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f82424a3..f4ad7433 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2101,7 +2101,6 @@ static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) { const int width = 4; - int16_t wL[4]; int16_t left[4][4]; int limit = MIN(3 << scale, width); @@ -2111,11 +2110,14 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m128i vidx = _mm_slli_epi32(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_ver_weight[offset]); + // For a 4 width block, height must be at least 4. Handle 4 lines at once. for (int y = 0; y < height; y += 4) { for (int xx = 0; xx < width; ++xx) { int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; - wL[xx] = xx < limit ? 32 >> ((2 * xx) >> scale) : 0; for (int yy = 0; yy < 4; ++yy) { left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; @@ -2125,11 +2127,9 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); - uint64_t quad; - memcpy(&quad, wL, sizeof(quad)); - __m256i vwL = _mm256_set1_epi64x(quad); + __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vwL, accu); + accu = _mm256_mullo_epi16(vweight, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); accu = _mm256_add_epi16(vdst16, accu); @@ -2855,12 +2855,12 @@ static void uvg_angular_pred_avx2( } if (PDPC_filter) { if (vertical_mode) - switch (height) { - case 4: angular_pdpc_ver_h4_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; - case 8: angular_pdpc_ver_h8_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + switch (width) { + case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; case 16: // 16 height and higher done with the same function case 32: - case 64: angular_pdpc_ver_h16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 50b4a9ee..70aa5496 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1322,4 +1322,12 @@ ALIGNED(32) const int16_t intra_pdpc_w8_hor_weight[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + +// Weights for intra pdpc w4 vertical. 
+ALIGNED(32) const int16_t intra_pdpc_w4_ver_weight[] = { + 32, 8, 2, 0, 32, 8, 2, 0, 32, 8, 2, 0, 32, 8, 2, 0, // Scale 0 + 32, 16, 8, 4, 32, 16, 8, 4, 32, 16, 8, 4, 32, 16, 8, 4, // Scale 1 + 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, // Scale 2 +}; + #endif INTRA_AVX2_TABLES_H From e5e2a8fbadfc8959234fc1809e0db7ed6c996052 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 1 Mar 2024 16:00:53 +0200 Subject: [PATCH 094/237] Improve intra pdpc w8 vertical. Load weights from table instead of calculating during runtime. --- src/strategies/avx2/intra-avx2.c | 24 +++++++++--------------- src/strategies/avx2/intra_avx2_tables.h | 9 +++++++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f4ad7433..b64227b0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2104,10 +2104,9 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, int16_t left[4][4]; int limit = MIN(3 << scale, width); - const int log2_width = uvg_g_convert_to_log2[width]; __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); + //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width __m256i v32s = _mm256_set1_epi16(32); // Scale can be 0, 1 or 2 @@ -2124,7 +2123,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2147,35 +2146,30 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width = 8; int limit = MIN(3 << scale, width); - const int log2_width = uvg_g_convert_to_log2[width]; __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, log2_width); + //__m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width __m256i v32s = _mm256_set1_epi16(32); + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_ver_weight[offset]); + // For width 8, height must be at least 2. Handle 2 lines at once. for (int y = 0; y < height; y += 2) { - ALIGNED(32) int16_t wL[8] = {0}; ALIGNED(32) int16_t left[16] = {0}; for (int xx = 0; xx < limit; ++xx) { int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; - wL[xx] = xx < limit ? 
32 >> ((2 * xx) >> scale) : 0; - for (int yy = 0; yy < 2; ++yy) { left[yy * width +xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; } } - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vseq, 8); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); - __m128i tmp[2]; - // Duplicate weights - tmp[0] = _mm_load_si128((__m128i*)wL); - tmp[1] = tmp[0]; - __m256i* vwL = (__m256i*)tmp; + __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(*vwL, accu); + accu = _mm256_mullo_epi16(vweight, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); accu = _mm256_add_epi16(vdst16, accu); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 70aa5496..b358a56e 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1330,4 +1330,13 @@ ALIGNED(32) const int16_t intra_pdpc_w4_ver_weight[] = { 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, // Scale 2 }; + +// Weights for intra pdpc w8 vertical. +ALIGNED(32) const int16_t intra_pdpc_w8_ver_weight[] = { + 32, 8, 2, 0, 0, 0, 0, 0, 32, 8, 2, 0, 0, 0, 0, 0, // Scale 0 + 32, 16, 8, 4, 2, 1, 0, 0, 32, 16, 8, 4, 2, 1, 0, 0, // Scale 1 + 32, 32, 16, 16, 8, 8, 4, 4, 32, 32, 16, 16, 8, 8, 4, 4, // Scale 2 +}; + + #endif INTRA_AVX2_TABLES_H From d443f6d330cf07a224f1739e55eb344978e0d570 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 1 Mar 2024 17:02:28 +0200 Subject: [PATCH 095/237] Improve intra pdpc w16 vertical. Load weights from table instead of calculating during runtime. Add table generator scripts. --- src/strategies/avx2/intra-avx2.c | 15 +++++---------- src/strategies/avx2/intra_avx2_tables.h | 8 ++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b64227b0..d321832f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2184,31 +2184,26 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) { - - int limit = MIN(3 << scale, width); - const int log2_width = uvg_g_convert_to_log2[width]; - - __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w16_ver_weight[offset]); + for (int y = 0; y < height; ++y) { for (int x = 0; x < limit; x += 16) { - ALIGNED(32) int16_t wL[16] = {0}; ALIGNED(32) int16_t left[16] = {0}; for (int xx = 0; x + xx < limit; ++xx) { int shifted_inv_angle_sum = (256 + (x + xx + 1) * inv_sample_disp) >> 9; - wL[xx] = xx < limit ? 
32 >> ((2 * (x + xx)) >> scale) : 0; left[xx] = ref_side[y + shifted_inv_angle_sum + 1]; } __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); - __m256i* vwL = (__m256i*)wL; + __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(*vwL, accu); + accu = _mm256_mullo_epi16(vweight, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); accu = _mm256_add_epi16(vdst16, accu); diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index b358a56e..ecc314e4 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1339,4 +1339,12 @@ ALIGNED(32) const int16_t intra_pdpc_w8_ver_weight[] = { }; +// Weights for intra pdpc w16 vertical. +ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { + 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Scale 0 + 32, 16, 8, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Scale 1 + 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1, 0, 0, 0, 0, // Scale 2 +}; + + #endif INTRA_AVX2_TABLES_H From 1354ba08f13e48b39f9cd41d97ef5d4e811afd27 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 4 Mar 2024 20:06:59 +0200 Subject: [PATCH 096/237] Revert "Replace x-loop with memcpy." This reverts commit 3373d000dec7838a7da0e7aa0f479acb5f3e4427. --- src/strategies/avx2/intra-avx2.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d321832f..11b3a893 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2396,7 +2396,9 @@ static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Set weight to zero if limit reached. // This removes the need to blend results with unmodified values in the end. wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; - memcpy(ref_top[yy], &ref_side[(x) + (inv_angle_sum >> 9) + 1], 4 * sizeof(int16_t)); + for (int xx = 0; xx < 4; ++xx) { + ref_top[yy][xx] = ref_side[(x + xx) + (inv_angle_sum >> 9) + 1]; + } } __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); @@ -2451,7 +2453,9 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { int inv_angle_sum = (256 + (y + yy + 1) * inv_sample_disp) >> 9; - memcpy(ref_top[yy], &ref_side[(inv_angle_sum >> 9) + 1], 4 * sizeof(int16_t)); + for (int x = 0; x < 4; ++x) { + ref_top[yy][x] = ref_side[x + inv_angle_sum + 1]; + } } const int offset = table_offset + o; From 3731fbe808e4eaaecf5ddd7083b8d4a11204d92f Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 4 Mar 2024 21:26:57 +0200 Subject: [PATCH 097/237] Load pre-calculated shifted inv angle sums from table. Add table generation script. 
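The table generation follows the expression the scalar code evaluated per pixel. A rough sketch of one table row (illustrative only, not the actual generation script): each mode_disp gets 64 entries of (256 + (i + 1) * inv_sample_disp) >> 9, with inv_sample_disp taken from the encoder's modedisp2invsampledisp table.

#include <stdint.h>

/* Fills one 64-entry row of intra_pdpc_shifted_inv_angle_sum for a given
 * inverse sample displacement (one row per mode_disp value). */
static void fill_shifted_inv_angle_row(int16_t row[64], int32_t inv_sample_disp)
{
    for (int i = 0; i < 64; ++i) {
        row[i] = (int16_t)((256 + (i + 1) * inv_sample_disp) >> 9);
    }
}
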
--- src/strategies/avx2/intra-avx2.c | 36 +++++++------ src/strategies/avx2/intra_avx2_tables.h | 69 +++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 11b3a893..fdea4625 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2434,7 +2434,7 @@ static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2449,12 +2449,14 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Scale can be 0, 1 or 2 const int table_offset = scale * 64; + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { - int inv_angle_sum = (256 + (y + yy + 1) * inv_sample_disp) >> 9; for (int x = 0; x < 4; ++x) { - ref_top[yy][x] = ref_side[x + inv_angle_sum + 1]; + ref_top[yy][x] = ref_side[x + shifted_inv_angle_sum[y + yy] + 1]; // TODO: this can be done with a 32-bit gather. NOTE: 8-bit values must be extended to 16-bit. } } const int offset = table_offset + o; @@ -2478,7 +2480,7 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 8; @@ -2490,16 +2492,16 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Scale can be 0, 1 or 2 const int table_offset = scale * 128; + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? 
for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { const __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_hor_weight[o]); ALIGNED(32) uvg_pixel tmp[16]; - int shifted_inv_angle_sum = (256 + (y + 0 + 1) * inv_sample_disp) >> 9; - memcpy(&tmp[0], &ref_side[shifted_inv_angle_sum + 1], 8 * sizeof(uvg_pixel)); - - shifted_inv_angle_sum = (256 + (y + 1 + 1) * inv_sample_disp) >> 9; - memcpy(&tmp[8], &ref_side[shifted_inv_angle_sum + 1], 8 * sizeof(uvg_pixel)); + memcpy(&tmp[0], &ref_side[shifted_inv_angle_sum[y + 0] + 1], 8 * sizeof(uvg_pixel)); + memcpy(&tmp[8], &ref_side[shifted_inv_angle_sum[y + 1] + 1], 8 * sizeof(uvg_pixel)); __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); @@ -2520,20 +2522,24 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) { int limit = MIN(3 << scale, height); __m256i v32s = _mm256_set1_epi16(32); + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + // Handle one line at a time. Skip line if vertical limit reached. for (int y = 0; y < limit; ++y) { const int16_t wT = 32 >> (2 * (y + 0) >> scale); __m256i vwT = _mm256_set1_epi16(wT); - int inv_angle_sum = 256 + (y + 1) * inv_sample_disp; + for (int x = 0; x < width; x += 16) { __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m128i vtop = _mm_load_si128((__m128i*)&ref_side[x + (inv_angle_sum >> 9) + 1]); + __m128i vtop = _mm_load_si128((__m128i*)&ref_side[x + shifted_inv_angle_sum[y] + 1]); __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); __m256i accu = _mm256_sub_epi16(vtop16, vpred16); @@ -2859,11 +2865,11 @@ static void uvg_angular_pred_avx2( } else switch (width) { - case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; - case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); break; + case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function case 32: - case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index ecc314e4..770174ca 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1347,4 +1347,73 @@ ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { }; +// Pre-calculated shifted inverse angle sums for pdpc for y-values [0, 64]. Grouped by mode_disp. 
+ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Mode disp 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, // Mode disp 1 +1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, + 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, // Mode disp 2 + 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, + 11, 21, 32, 43, 53, 64, 75, 85, 96, 107, 117, 128, 139, 149, 160, 171, 181, 192, 203, 213, 224, 235, 245, 256, 267, 277, 288, 299, 309, 320, 331, 341, // Mode disp 3 + 352, 363, 373, 384, 395, 405, 416, 427, 437, 448, 459, 469, 480, 491, 501, 512, 523, 533, 544, 555, 565, 576, 587, 597, 608, 619, 629, 640, 651, 661, 672, 683, + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, // Mode disp 4 + 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, + 5, 11, 16, 21, 27, 32, 37, 43, 48, 53, 59, 64, 69, 75, 80, 85, 91, 96, 101, 107, 112, 117, 123, 128, 133, 139, 144, 149, 155, 160, 165, 171, // Mode disp 5 + 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 341, + 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, // Mode disp 6 + 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, + 3, 6, 10, 13, 16, 19, 22, 26, 29, 32, 35, 38, 42, 45, 48, 51, 54, 58, 61, 64, 67, 70, 74, 77, 80, 83, 86, 90, 93, 96, 99, 102, // Mode disp 7 + 106, 109, 112, 115, 118, 122, 125, 128, 131, 134, 138, 141, 144, 147, 150, 154, 157, 160, 163, 166, 170, 173, 176, 179, 182, 186, 189, 192, 195, 198, 202, 205, + 3, 5, 8, 11, 13, 16, 19, 21, 24, 27, 29, 32, 35, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 85, // Mode disp 8 + 88, 91, 93, 96, 99, 101, 104, 107, 109, 112, 115, 117, 120, 123, 125, 128, 131, 133, 136, 139, 141, 144, 147, 149, 152, 155, 157, 160, 163, 165, 168, 171, + 2, 5, 7, 9, 11, 14, 16, 18, 21, 23, 25, 27, 30, 32, 34, 37, 39, 41, 43, 46, 48, 50, 53, 55, 57, 59, 62, 64, 66, 69, 71, 73, // Mode disp 9 + 75, 78, 80, 82, 85, 87, 89, 91, 94, 96, 98, 101, 103, 105, 107, 110, 112, 114, 117, 119, 121, 123, 126, 128, 130, 133, 135, 137, 139, 142, 144, 146, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // Mode disp 10 + 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, + 2, 4, 5, 7, 9, 11, 12, 14, 16, 18, 20, 21, 23, 25, 27, 28, 30, 32, 34, 36, 37, 39, 41, 43, 44, 46, 
48, 50, 52, 53, 55, 57, // Mode disp 11 + 59, 60, 62, 64, 66, 68, 69, 71, 73, 75, 76, 78, 80, 82, 84, 85, 87, 89, 91, 92, 94, 96, 98, 100, 101, 103, 105, 107, 108, 110, 112, 114, + 2, 3, 5, 6, 8, 10, 11, 13, 14, 16, 18, 19, 21, 22, 24, 26, 27, 29, 30, 32, 34, 35, 37, 38, 40, 42, 43, 45, 46, 48, 50, 51, // Mode disp 12 + 53, 54, 56, 58, 59, 61, 62, 64, 66, 67, 69, 70, 72, 74, 75, 77, 78, 80, 82, 83, 85, 86, 88, 90, 91, 93, 94, 96, 98, 99, 101, 102, + 1, 3, 4, 6, 7, 8, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 24, 25, 26, 28, 29, 31, 32, 33, 35, 36, 38, 39, 40, 42, 43, 45, // Mode disp 13 + 46, 47, 49, 50, 51, 53, 54, 56, 57, 58, 60, 61, 63, 64, 65, 67, 68, 70, 71, 72, 74, 75, 76, 78, 79, 81, 82, 83, 85, 86, 88, 89, + 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34, 36, 37, 38, 39, // Mode disp 14 + 41, 42, 43, 44, 46, 47, 48, 49, 50, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, + 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, // Mode disp 15 + 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, // Mode disp 16 + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27, 28, 29, // Mode disp 17 + 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 1, 2, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 11, 12, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 25, 26, // Mode disp 18 + 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, + 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, // Mode disp 19 + 23, 24, 25, 26, 26, 27, 28, 28, 29, 30, 31, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, 38, 39, 40, 41, 41, 42, 43, 43, 44, 45, 46, + 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, 19, 19, 20, // Mode disp 20 + 21, 21, 22, 23, 23, 24, 24, 25, 26, 26, 27, 28, 28, 29, 29, 30, 31, 31, 32, 33, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 39, 40, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 15, 15, 16, 16, 17, 17, 18, // Mode disp 21 + 18, 19, 20, 20, 21, 21, 22, 22, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 29, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 35, 35, 36, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, // Mode disp 22 + 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, // Mode disp 23 + 14, 15, 15, 16, 16, 17, 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 25, 26, 26, 27, 27, 28, 28, + 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, // Mode disp 24 + 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 
19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, 24, + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, // Mode disp 25 + 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20, 20, + 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, // Mode disp 26 + 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, + 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, // Mode disp 27 + 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, // Mode disp 28 + 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, // Mode disp 29 + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, // Mode disp 30 + 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // Mode disp 31 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + + #endif INTRA_AVX2_TABLES_H From 71676b36e8a2508a98c75eb0e7462bbf81c72c8a Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 4 Mar 2024 22:47:53 +0200 Subject: [PATCH 098/237] Load pre-calculated shifted inv angle sums from table (vertical modes). --- src/strategies/avx2/intra-avx2.c | 34 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fdea4625..9337d9ec 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2098,7 +2098,7 @@ static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; int16_t left[4][4]; @@ -2113,13 +2113,15 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int offset = scale * 16; const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_ver_weight[offset]); + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
for (int y = 0; y < height; y += 4) { for (int xx = 0; xx < width; ++xx) { - int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; } } @@ -2141,7 +2143,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 8; @@ -2154,13 +2156,16 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int offset = scale * 16; const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_ver_weight[offset]); + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + // For width 8, height must be at least 2. Handle 2 lines at once. for (int y = 0; y < height; y += 2) { ALIGNED(32) int16_t left[16] = {0}; for (int xx = 0; xx < limit; ++xx) { - int shifted_inv_angle_sum = (256 + (xx + 1) * inv_sample_disp) >> 9; for (int yy = 0; yy < 2; ++yy) { - left[yy * width +xx] = ref_side[(y + yy) + shifted_inv_angle_sum + 1]; + left[yy * width +xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; } } @@ -2182,7 +2187,7 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) +static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) { int limit = MIN(3 << scale, width); __m256i v32s = _mm256_set1_epi16(32); @@ -2190,12 +2195,15 @@ static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int offset = scale * 16; const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w16_ver_weight[offset]); + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? 
+ for (int y = 0; y < height; ++y) { for (int x = 0; x < limit; x += 16) { ALIGNED(32) int16_t left[16] = {0}; for (int xx = 0; x + xx < limit; ++xx) { - int shifted_inv_angle_sum = (256 + (x + xx + 1) * inv_sample_disp) >> 9; - left[xx] = ref_side[y + shifted_inv_angle_sum + 1]; + left[xx] = ref_side[y + shifted_inv_angle_sum[xx] + 1]; } __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); @@ -2855,11 +2863,11 @@ static void uvg_angular_pred_avx2( if (PDPC_filter) { if (vertical_mode) switch (width) { - case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; - case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, width, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, width, scale, mode_disp); break; + case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, width, scale, mode_disp); break; case 16: // 16 height and higher done with the same function case 32: - case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, modedisp2invsampledisp[abs(mode_disp)]); break; + case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } From 3bb3fa8c96207a1b5a2898f6d88579ef0396e8e5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 5 Mar 2024 00:06:33 +0200 Subject: [PATCH 099/237] Implement scale 0 special case for w16 vertical pdpc. --- src/strategies/avx2/intra-avx2.c | 68 ++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9337d9ec..8ae51121 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2225,6 +2225,57 @@ static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } +static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w4 function, retrofitted to work with width 16 and up when scale is 0. + // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. + const int scale = 0; + int16_t left[4][4]; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 3; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_ver_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
+ for (int y = 0; y < height; y += 4) { + for (int xx = 0; xx < 4; ++xx) { + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * 4), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(filtered, 3); + } +} + // Height versions of vertical PDPC @@ -2861,17 +2912,27 @@ static void uvg_angular_pred_avx2( } } if (PDPC_filter) { - if (vertical_mode) + if (vertical_mode) { switch (width) { case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, width, scale, mode_disp); break; case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, width, scale, mode_disp); break; case 16: // 16 height and higher done with the same function case 32: - case 64: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 64: + switch (scale) { + case 0: angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); break; + //case 0: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 1: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + default: + assert(false && "Intra PDPC: Invalid scale.\n"); + } + break; default: assert(false && "Intra PDPC: Invalid width.\n"); } - else + } + else { switch (width) { case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); break; case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; @@ -2881,6 +2942,7 @@ static void uvg_angular_pred_avx2( default: assert(false && "Intra PDPC: Invalid width.\n"); } + } } } } From 7be091d0f687d43a503e96353cfb6f66d60af56c Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 5 Mar 2024 15:48:52 +0200 Subject: [PATCH 100/237] Implement functions for different scales for vertical pdpc. Scale determines the amount of zero weights. Limiting the work based on scale limits the amount of unnecessary work done. 
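To illustrate the relationship (a standalone sketch, not part of this patch): with limit = MIN(3 << scale, width), only the first 3, 6 or 12 columns of a 16-wide block ever get a nonzero weight, which is why scale 0 can reuse the 4-wide path, scale 1 the 8-wide path, and only scale 2 needs the full 16-wide loop.

#include <stdio.h>
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Prints the per-column PDPC weights of a 16-wide block for each scale. */
int main(void)
{
    for (int scale = 0; scale <= 2; ++scale) {
        const int limit = MIN(3 << scale, 16); /* 3, 6, 12 */
        printf("scale %d:", scale);
        for (int x = 0; x < 16; ++x) {
            printf(" %2d", x < limit ? 32 >> ((2 * x) >> scale) : 0);
        }
        printf("\n");
    }
    return 0;
}
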
--- src/strategies/avx2/intra-avx2.c | 73 ++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8ae51121..2fe36a09 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2187,10 +2187,11 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { - int limit = MIN(3 << scale, width); __m256i v32s = _mm256_set1_epi16(32); + const int scale = 2; // Other functions handle scales 0 and 1 + int limit = 12; // With scale 2, limit is always 12. const int offset = scale * 16; const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w16_ver_weight[offset]); @@ -2229,6 +2230,7 @@ static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re { // NOTE: This function is just the w4 function, retrofitted to work with width 16 and up when scale is 0. // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. + // NOTE: This function also works with width 8 when scale is 0, the name w16 might be a bit misleading. const int scale = 0; int16_t left[4][4]; const int log2_width = uvg_g_convert_to_log2[width]; @@ -2255,7 +2257,7 @@ static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re } } - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * 4), vidx, 1); + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2276,6 +2278,53 @@ static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re } } +static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. + // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + int16_t shifted_inv_angle_sum[64]; + memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + + // For width 8, height must be at least 2. Handle 2 lines at once. 
+ for (int y = 0; y < height; y += 2) { + ALIGNED(32) int16_t left[16] = { 0 }; + for (int yy = 0; yy < 2; ++yy) { + for (int xx = 0; xx < limit; ++xx) { + left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + } +} // Height versions of vertical PDPC @@ -2914,16 +2963,22 @@ static void uvg_angular_pred_avx2( if (PDPC_filter) { if (vertical_mode) { switch (width) { - case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, width, scale, mode_disp); break; - case 8: angular_pdpc_ver_w8_avx2(dst, ref_side, width, scale, mode_disp); break; - case 16: // 16 height and higher done with the same function + case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); break; + case 8: + if (scale == 0) { + angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); // Special case for scale 0. Use the w16_scale0 function since it works with w8 also. + } + else { + angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); + } + break; + case 16: // 16 width and higher done with the same function case 32: case 64: switch (scale) { case 0: angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); break; - //case 0: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; - case 1: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; - case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 1: angular_pdpc_ver_w16_scale1_avx2(dst, ref_side, width, height, mode_disp); break; + case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); break; default: assert(false && "Intra PDPC: Invalid scale.\n"); } From 6a9bbec1974adb5fdbf89b0011219249f0c7a3ac Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 6 Mar 2024 14:04:45 +0200 Subject: [PATCH 101/237] Replace inv angle sum loads with a simple pointer assignment. --- src/strategies/avx2/intra-avx2.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 2fe36a09..4ba6c42a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2114,8 +2114,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_ver_weight[offset]); const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
for (int y = 0; y < height; y += 4) { @@ -2157,8 +2156,7 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_ver_weight[offset]); const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; // For width 8, height must be at least 2. Handle 2 lines at once. for (int y = 0; y < height; y += 2) { @@ -2197,8 +2195,7 @@ static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w16_ver_weight[offset]); const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; for (int y = 0; y < height; ++y) { for (int x = 0; x < limit; x += 16) { @@ -2295,8 +2292,7 @@ static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_weight[offset]); const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t *shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; // For width 8, height must be at least 2. Handle 2 lines at once. for (int y = 0; y < height; y += 2) { @@ -2558,8 +2554,7 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Scale can be 0, 1 or 2 const int table_offset = scale * 64; const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { @@ -2601,8 +2596,7 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Scale can be 0, 1 or 2 const int table_offset = scale * 128; const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? 
+ const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { const __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_hor_weight[o]); @@ -2636,8 +2630,7 @@ static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, __m256i v32s = _mm256_set1_epi16(32); const int inv_angle_offset = mode_disp * 64; - int16_t shifted_inv_angle_sum[64]; - memcpy(shifted_inv_angle_sum, &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset], height * sizeof(int16_t)); // TODO: would this be faster if the max amount (64) would be always loaded? + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; // Handle one line at a time. Skip line if vertical limit reached. for (int y = 0; y < limit; ++y) { From 17346455023870240ee6924e8d07bcdd00de4d5f Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 11 Mar 2024 21:05:06 +0200 Subject: [PATCH 102/237] Add option for intra exhaustive test. Tests all mode and block size combinations. --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4ba6c42a..fdfa1dfd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2684,8 +2684,8 @@ static void uvg_angular_pred_avx2( assert(log2_height >= 1); } - // Modes [-1, -14] and [67, 80] are wide angle modes - assert(intra_mode >= -14 && intra_mode <= 80); + // Modes [-1, -14] and [67, 81] are wide angle modes + assert(intra_mode >= -14 && intra_mode <= 81); uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; uint8_t isp = isp_mode; From cb23edfa09d50c4719f1a9393e2d3e3989319047 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 6 Mar 2024 15:01:19 +0200 Subject: [PATCH 103/237] Replace x-loop in pdpc horizontal w4 AGAIN. This time make it work by extending the loaded 8-bit values. --- src/strategies/avx2/intra-avx2.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fdfa1dfd..bc00956a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2543,7 +2543,7 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width = 4; int16_t wT[4]; - int16_t ref_top[4][4]; + int8_t ref_top[4][4]; int limit = MIN(3 << scale, height); @@ -2558,18 +2558,17 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, for (int y = 0, o = 0; y < limit; y += 4, o += 16) { for (int yy = 0; yy < 4; ++yy) { - for (int x = 0; x < 4; ++x) { - ref_top[yy][x] = ref_side[x + shifted_inv_angle_sum[y + yy] + 1]; // TODO: this can be done with a 32-bit gather. NOTE: 8-bit values must be extended to 16-bit. 
- } + memcpy(ref_top[yy], &ref_side[shifted_inv_angle_sum[y + yy] + 1], 4 * sizeof(int8_t)); } const int offset = table_offset + o; __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); + __m128i vtop = _mm_loadu_si128((__m128i*)ref_top); + __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_hor_weight[offset]); - __m256i accu = _mm256_sub_epi16(vtop, vpred16); + __m256i accu = _mm256_sub_epi16(vtop16, vpred16); accu = _mm256_mullo_epi16(vwT, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); From b178eac466118c3dcef7b97f0772ad8efbe4408a Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 6 Mar 2024 18:03:09 +0200 Subject: [PATCH 104/237] Improve intra pdpc w4 vertical and horizontal. Improve load times by utilising load + shuffle whenever possible. --- src/strategies/avx2/intra-avx2.c | 146 ++++++- src/strategies/avx2/intra_avx2_tables.h | 558 +++++++++++++++++++++++- 2 files changed, 693 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index bc00956a..9f9da926 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2322,7 +2322,67 @@ static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } } -// Height versions of vertical PDPC + +static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 4; + int16_t left[4][4]; + + int limit = MIN(3 << scale, width); + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); + const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
+ for (int y = 0; y < height; y += 4) { + for (int xx = 0; xx < width; ++xx) { + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_ver_w8_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + +} + +static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) +{ + +} + + +// Height versions of vertical PDPC, these are unused but left here for archiving purposes. Maybe this method can be refined to be effective. static void angular_pdpc_ver_h4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int scale, const int16_t inv_sample_disp) { @@ -2658,6 +2718,57 @@ static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } +static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 4; + + int16_t wT[4]; + int8_t ref_top[4][4]; + + int limit = MIN(3 << scale, height); + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int table_offset = scale * 64; + const int shuf_offset = mode_disp * 256; + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + for (int y = 0, o = 0; y < limit; y += 4, o += 16) { + const __m128i vshuf = _mm_loadu_si128((__m128i*)&intra_pdpc_shuffle_vectors_w4_hor[shuf_offset + o]); + /*for (int yy = 0; yy < 4; ++yy) { + memcpy(ref_top[yy], &ref_side[shifted_inv_angle_sum[y + yy] + 1], 4 * sizeof(int8_t)); + }*/ + + __m128i vtop = _mm_loadu_si128((__m128i*)&ref_side[shifted_inv_angle_sum[y] + 1]); + vtop = _mm_shuffle_epi8(vtop, vshuf); + + const int offset = table_offset + o; + + __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + //__m128i vtop = _mm_loadu_si128((__m128i*)ref_top); + __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); + __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_hor_weight[offset]); + + __m256i accu = _mm256_sub_epi16(vtop16, vpred16); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + + static void 
uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, @@ -2954,9 +3065,17 @@ static void uvg_angular_pred_avx2( } if (PDPC_filter) { if (vertical_mode) { + // Note: no need to check for negative mode_disp, since it is already checked before. switch (width) { - case 4: angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); break; - case 8: + case 4: + // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. + // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + if (mode_disp < 6) + angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + break; + case 8: if (scale == 0) { angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); // Special case for scale 0. Use the w16_scale0 function since it works with w8 also. } @@ -2966,13 +3085,13 @@ static void uvg_angular_pred_avx2( break; case 16: // 16 width and higher done with the same function case 32: - case 64: + case 64: switch (scale) { - case 0: angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); break; - case 1: angular_pdpc_ver_w16_scale1_avx2(dst, ref_side, width, height, mode_disp); break; - case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); break; - default: - assert(false && "Intra PDPC: Invalid scale.\n"); + case 0: angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); break; + case 1: angular_pdpc_ver_w16_scale1_avx2(dst, ref_side, width, height, mode_disp); break; + case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); break; + default: + assert(false && "Intra PDPC: Invalid scale.\n"); } break; default: @@ -2981,7 +3100,14 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); break; + case 4: + // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. + // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + if (mode_disp < 6) + angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_hor_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + break; case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function case 32: diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 770174ca..16d62459 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1347,7 +1347,8 @@ ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { }; -// Pre-calculated shifted inverse angle sums for pdpc for y-values [0, 64]. Grouped by mode_disp. +// Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. +// Index by y or x based on pdpc direction. 
ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Mode disp 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1416,4 +1417,559 @@ ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { }; +// TODO: prune this table. These is a ton of duplicates. Pruning may introduce some extra logic, but it will save a lot of space and probably speed up memory access. +// NOTE: The vectors from this table can be only used up from mode disp 6. The reference samples are too sparse for vectorized shuffle below mode disp 6. +// Shuffle vectors for w4 horizontal pdpc. +ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_hor[] = { + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, // Mode disp 0 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, // Mode disp 1 + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 
0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x020, 0x021, 0x022, 0x023, 0x040, 0x041, 0x042, 0x043, 0x060, 0x061, 0x062, 0x063, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, // Mode disp 2 + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x010, 0x011, 0x012, 0x013, 0x020, 0x021, 0x022, 0x023, 0x030, 0x031, 0x032, 0x033, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, // Mode disp 3 + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x016, 0x017, 0x018, 0x019, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x016, 0x017, 0x018, 0x019, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 
0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x016, 0x017, 0x018, 0x019, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x016, 0x017, 0x018, 0x019, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x016, 0x017, 0x018, 0x019, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00b, 0x00c, 0x00d, 0x00e, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x00a, 0x00b, 0x00c, 0x00d, 0x015, 0x016, 0x017, 0x018, 0x020, 0x021, 0x022, 0x023, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, // Mode disp 4 + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x008, 0x009, 0x00a, 0x00b, 0x010, 0x011, 0x012, 0x013, 0x018, 0x019, 0x01a, 0x01b, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, // Mode disp 5 + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00a, 0x00b, 0x00c, 0x00d, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00a, 0x00b, 0x00c, 0x00d, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00b, 0x00c, 0x00d, 
0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00a, 0x00b, 0x00c, 0x00d, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00a, 0x00b, 0x00c, 0x00d, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00a, 0x00b, 0x00c, 0x00d, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x006, 0x007, 0x008, 0x009, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x013, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, // Mode disp 6 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, // Mode disp 7 + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x007, 
0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x009, 0x00a, 0x00b, 0x00c, + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, 0x00d, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, // Mode disp 8 + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00b, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, // Mode disp 9 + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 
0x005, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, 0x007, 0x008, 0x009, 0x00a, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, // Mode disp 10 + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, // Mode disp 11 + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 
0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x006, 0x007, 0x008, 0x009, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, // Mode disp 12 + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, // Mode disp 13 + 0x000, 0x001, 0x002, 
0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x005, 0x006, 0x007, 0x008, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, // Mode disp 14 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 
0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 15 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x003, 0x004, 0x005, 0x006, 0x004, 0x005, 0x006, 0x007, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 16 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 
0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 17 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, // Mode disp 18 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 
0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, // Mode disp 19 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, // Mode disp 20 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 
0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 21 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 22 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 
0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, // Mode disp 23 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 24 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 
0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 25 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 26 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 
0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, // Mode disp 27 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, // Mode disp 28 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 
0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, // Mode disp 29 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, // Mode disp 30 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 
0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, // Mode disp 31 + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, + 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, +}; + + +// Shuffle vectors for w4 vertical pdpc. 
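+// NOTE: one 16-byte shuffle row per mode_disp. As with the horizontal table above, these rows are meant
+// to be applied with _mm_shuffle_epi8 in the w4 high angle pdpc code, so the sparse reference samples can
+// be picked from a single 16-byte load instead of a gather.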
+ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { + 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 0 + 0x000, 0x020, 0x040, 0x060, 0x001, 0x021, 0x041, 0x061, 0x002, 0x022, 0x042, 0x062, 0x003, 0x023, 0x043, 0x063, // Mode disp 1 + 0x000, 0x010, 0x020, 0x030, 0x001, 0x011, 0x021, 0x031, 0x002, 0x012, 0x022, 0x032, 0x003, 0x013, 0x023, 0x033, // Mode disp 2 + 0x000, 0x00a, 0x015, 0x020, 0x001, 0x00b, 0x016, 0x021, 0x002, 0x00c, 0x017, 0x022, 0x003, 0x00d, 0x018, 0x023, // Mode disp 3 + 0x000, 0x008, 0x010, 0x018, 0x001, 0x009, 0x011, 0x019, 0x002, 0x00a, 0x012, 0x01a, 0x003, 0x00b, 0x013, 0x01b, // Mode disp 4 + 0x000, 0x006, 0x00b, 0x010, 0x001, 0x007, 0x00c, 0x011, 0x002, 0x008, 0x00d, 0x012, 0x003, 0x009, 0x00e, 0x013, // Mode disp 5 + 0x000, 0x004, 0x008, 0x00c, 0x001, 0x005, 0x009, 0x00d, 0x002, 0x006, 0x00a, 0x00e, 0x003, 0x007, 0x00b, 0x00f, // Mode disp 6 + 0x000, 0x003, 0x007, 0x00a, 0x001, 0x004, 0x008, 0x00b, 0x002, 0x005, 0x009, 0x00c, 0x003, 0x006, 0x00a, 0x00d, // Mode disp 7 + 0x000, 0x002, 0x005, 0x008, 0x001, 0x003, 0x006, 0x009, 0x002, 0x004, 0x007, 0x00a, 0x003, 0x005, 0x008, 0x00b, // Mode disp 8 + 0x000, 0x003, 0x005, 0x007, 0x001, 0x004, 0x006, 0x008, 0x002, 0x005, 0x007, 0x009, 0x003, 0x006, 0x008, 0x00a, // Mode disp 9 + 0x000, 0x002, 0x004, 0x006, 0x001, 0x003, 0x005, 0x007, 0x002, 0x004, 0x006, 0x008, 0x003, 0x005, 0x007, 0x009, // Mode disp 10 + 0x000, 0x002, 0x003, 0x005, 0x001, 0x003, 0x004, 0x006, 0x002, 0x004, 0x005, 0x007, 0x003, 0x005, 0x006, 0x008, // Mode disp 11 + 0x000, 0x001, 0x003, 0x004, 0x001, 0x002, 0x004, 0x005, 0x002, 0x003, 0x005, 0x006, 0x003, 0x004, 0x006, 0x007, // Mode disp 12 + 0x000, 0x002, 0x003, 0x005, 0x001, 0x003, 0x004, 0x006, 0x002, 0x004, 0x005, 0x007, 0x003, 0x005, 0x006, 0x008, // Mode disp 13 + 0x000, 0x001, 0x003, 0x004, 0x001, 0x002, 0x004, 0x005, 0x002, 0x003, 0x005, 0x006, 0x003, 0x004, 0x006, 0x007, // Mode disp 14 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 15 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 16 + 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 17 + 0x000, 0x001, 0x001, 0x002, 0x001, 0x002, 0x002, 0x003, 0x002, 0x003, 0x003, 0x004, 0x003, 0x004, 0x004, 0x005, // Mode disp 18 + 0x000, 0x000, 0x001, 0x002, 0x001, 0x001, 0x002, 0x003, 0x002, 0x002, 0x003, 0x004, 0x003, 0x003, 0x004, 0x005, // Mode disp 19 + 0x000, 0x000, 0x001, 0x002, 0x001, 0x001, 0x002, 0x003, 0x002, 0x002, 0x003, 0x004, 0x003, 0x003, 0x004, 0x005, // Mode disp 20 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 21 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 22 + 0x000, 0x001, 0x001, 0x002, 0x001, 0x002, 0x002, 0x003, 0x002, 0x003, 0x003, 0x004, 0x003, 0x004, 0x004, 0x005, // Mode disp 23 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 24 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 25 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // 
Mode disp 26 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 27 + 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, // Mode disp 28 + 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 29 + 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 30 + 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 31 +}; + #endif INTRA_AVX2_TABLES_H From ace8178773fe6dd2e4c284a4f07c2fb578fdb1ed Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 8 Mar 2024 16:03:38 +0200 Subject: [PATCH 105/237] Improve intra vertical pdpc memory handling with special handling for different scales and high angles. Add table generation scripts. --- src/strategies/avx2/intra-avx2.c | 363 +++++++++++++++++++++++- src/strategies/avx2/intra_avx2_tables.h | 112 ++++++++ 2 files changed, 465 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9f9da926..4fadae6f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2142,6 +2142,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } + static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 8; @@ -2185,6 +2186,13 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } + +static void angular_pdpc_ver_w8_high_angle_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int mode_disp) +{ + +} + + static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); @@ -2322,6 +2330,48 @@ static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } } +static void angular_pdpc_ver_w16_scale2_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + __m256i v32s = _mm256_set1_epi16(32); + const int scale = 2; // Other functions handle scales 0 and 1 + int limit = 12; // With scale 2, limit is always 12. 
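+  // The commented-out scalar gather below is replaced by one unaligned 16-byte load
+  // based at shifted_inv_angle_sum[0]; the per-mode shuffle vector then distributes
+  // the loaded reference samples to their columns before the usual PDPC weighting.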
+ + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w16_ver_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_load_si128((const __m128i*) &intra_pdpc_shuffle_vectors_w16_scale2_ver[shuf_offset]); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < limit; x += 16) { + /*ALIGNED(32) int16_t left[16] = { 0 }; + for (int xx = 0; x + xx < limit; ++xx) { + left[xx] = ref_side[y + shifted_inv_angle_sum[xx] + 1]; + }*/ + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); + } + } +} + static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { @@ -2345,11 +2395,11 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* // For a 4 width block, height must be at least 4. Handle 4 lines at once. for (int y = 0; y < height; y += 4) { - for (int xx = 0; xx < width; ++xx) { + /*for (int xx = 0; xx < width; ++xx) { for (int yy = 0; yy < 4; ++yy) { left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; } - } + }*/ __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum[0] + 1]); vleft = _mm_shuffle_epi8(vleft, vshuf); @@ -2371,14 +2421,280 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } } -static void angular_pdpc_ver_w8_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) + +static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { + // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. + // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. + const int scale = 0; + int16_t left[4][4]; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 3; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
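+  // Each iteration gathers four 4-pixel rows of dst with a 32-bit gather, widens them
+  // to 16 bits next to the left samples collected in the int16_t scratch array, applies
+  // the PDPC weights (accu = dst + ((w * (left - dst) + 32) >> 6)), and writes the four
+  // filtered rows back with 32-bit stores.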
+ for (int y = 0; y < height; y += 4) { + for (int xx = 0; xx < 4; ++xx) { + for (int yy = 0; yy < 4; ++yy) { + left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(filtered, 3); + } } -static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { + // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. + // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 0; + int16_t left[4][4]; + const int log2_width = uvg_g_convert_to_log2[width]; + const int limit = 3; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
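+  // Same flow as the scale 0 function above, except the left reference samples for the
+  // whole 4x4 strip come from one 16-byte load based at shifted_inv_angle_sum[0] followed
+  // by a pshufb with the precomputed per-mode shuffle vector, instead of a scalar loop.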
+ for (int y = 0; y < height; y += 4) { + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(filtered, 0); + *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(filtered, 1); + *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(filtered, 2); + *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(filtered, 3); + } +} + + +static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. + // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_8x2_scale1_ver[shuf_offset]); + + // For width 8, height must be at least 2. Handle 2 lines at once. + for (int y = 0; y < height; y += 2) { + /*ALIGNED(32) int16_t left[16] = { 0 }; + for (int yy = 0; yy < 2; ++yy) { + for (int xx = 0; xx < limit; ++xx) { + left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + }*/ + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + //__m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. 
+ // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity + if (width == 8) { + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } + else { + *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + } + } +} + +static void angular_pdpc_ver_8x2_scale2_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. + // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_8x2_scale2_ver[shuf_offset]); + + // For width 8, height must be at least 2. Handle 2 lines at once. + for (int y = 0; y < height; y += 2) { + /*ALIGNED(32) int16_t left[16] = { 0 }; + for (int yy = 0; yy < 2; ++yy) { + for (int xx = 0; xx < limit; ++xx) { + left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + }*/ + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + //__m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. + // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity + if (width == 8) { + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } + else { + *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + } + } +} + +static void angular_pdpc_ver_8x2_scale1_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. 
+ // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + // For width 8, height must be at least 2. Handle 2 lines at once. + for (int y = 0; y < height; y += 2) { + ALIGNED(32) int16_t left[16] = { 0 }; + for (int yy = 0; yy < 2; ++yy) { + for (int xx = 0; xx < limit; ++xx) { + left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. + // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity + if (width == 8) { + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } + else { + *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + } + } } @@ -3077,19 +3393,46 @@ static void uvg_angular_pred_avx2( break; case 8: if (scale == 0) { - angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); // Special case for scale 0. Use the w16_scale0 function since it works with w8 also. 
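+        // With scale 0 only the first three columns get nonzero weights, so the 4x4
+        // helpers cover width 8 as well; larger mode displacements take the
+        // shuffle-based variant.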
+ if (mode_disp < 6) + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + } + else if (scale == 1) { + if (mode_disp < 8) + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); } else { - angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); + if (mode_disp < 10) + angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); } break; - case 16: // 16 width and higher done with the same function + case 16: // 16 width and higher done with the same functions case 32: case 64: switch (scale) { - case 0: angular_pdpc_ver_w16_scale0_avx2(dst, ref_side, width, height, mode_disp); break; - case 1: angular_pdpc_ver_w16_scale1_avx2(dst, ref_side, width, height, mode_disp); break; - case 2: angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); break; + case 0: + if (mode_disp < 6) + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + break; + case 1: + if (mode_disp < 8) + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + break; + case 2: + if (mode_disp < 14) + angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_w16_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); + break; default: assert(false && "Intra PDPC: Invalid scale.\n"); } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 16d62459..7829e842 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1972,4 +1972,116 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 31 }; + +// Shuffle vectors for 8x2 scale 1 vertical pdpc. 0xfff entries are "don't care", those will be zeroed out by zero weights +// These are basically same as the 8x2 scale2 vectors, but with added "don't care" entries. This table can be safely removed. 
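+// The 0xfff entries end up as 0xff in the int8_t table; pshufb zeroes any lane whose
+// control byte has the high bit set, and those lanes also carry zero PDPC weight, so
+// they never affect the output.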
+ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 0 + 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0xfff, 0xfff, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0xfff, 0xfff, // Mode disp 1 + 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0xfff, 0xfff, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0xfff, 0xfff, // Mode disp 2 + 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0xfff, 0xfff, 0x001, 0x00b, 0x016, 0x021, 0x02b, 0x036, 0xfff, 0xfff, // Mode disp 3 + 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0xfff, 0xfff, 0x001, 0x009, 0x011, 0x019, 0x021, 0x029, 0xfff, 0xfff, // Mode disp 4 + 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0xfff, 0xfff, 0x001, 0x007, 0x00c, 0x011, 0x017, 0x01c, 0xfff, 0xfff, // Mode disp 5 + 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0xfff, 0xfff, 0x001, 0x005, 0x009, 0x00d, 0x011, 0x015, 0xfff, 0xfff, // Mode disp 6 + 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0xfff, 0xfff, 0x001, 0x004, 0x008, 0x00b, 0x00e, 0x011, 0xfff, 0xfff, // Mode disp 7 + 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0xfff, 0xfff, 0x001, 0x003, 0x006, 0x009, 0x00b, 0x00e, 0xfff, 0xfff, // Mode disp 8 + 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0xfff, 0xfff, 0x001, 0x004, 0x006, 0x008, 0x00a, 0x00d, 0xfff, 0xfff, // Mode disp 9 + 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0xfff, 0xfff, 0x001, 0x003, 0x005, 0x007, 0x009, 0x00b, 0xfff, 0xfff, // Mode disp 10 + 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0xfff, 0xfff, 0x001, 0x003, 0x004, 0x006, 0x008, 0x00a, 0xfff, 0xfff, // Mode disp 11 + 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0xfff, 0xfff, 0x001, 0x002, 0x004, 0x005, 0x007, 0x009, 0xfff, 0xfff, // Mode disp 12 + 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0xfff, 0xfff, 0x001, 0x003, 0x004, 0x006, 0x007, 0x008, 0xfff, 0xfff, // Mode disp 13 + 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0xfff, 0xfff, 0x001, 0x002, 0x004, 0x005, 0x006, 0x007, 0xfff, 0xfff, // Mode disp 14 + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x006, 0x007, 0xfff, 0xfff, // Mode disp 15 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0xfff, 0xfff, // Mode disp 16 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x005, 0x005, 0xfff, 0xfff, // Mode disp 17 + 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x003, 0x004, 0x005, 0xfff, 0xfff, // Mode disp 18 + 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x003, 0x004, 0x004, 0xfff, 0xfff, // Mode disp 19 + 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x003, 0x003, 0x004, 0xfff, 0xfff, // Mode disp 20 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 21 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 22 + 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0xfff, 0xfff, // Mode disp 23 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 24 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 25 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0xfff, 
0xfff, // Mode disp 26 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0xfff, 0xfff, // Mode disp 27 + 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0xfff, 0xfff, // Mode disp 28 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0xfff, 0xfff, // Mode disp 29 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 30 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 31 +}; + + +// Shuffle vectors for 8x2 scale 2 vertical pdpc. +ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 0 -- Unused + 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0x0c1, 0x0e1, // Mode disp 1 * + 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0x061, 0x071, // Mode disp 2 * + 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0x040, 0x04a, 0x001, 0x00b, 0x016, 0x021, 0x02b, 0x036, 0x041, 0x04b, // Mode disp 3 * + 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0x030, 0x038, 0x001, 0x009, 0x011, 0x019, 0x021, 0x029, 0x031, 0x039, // Mode disp 4 * + 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0x020, 0x026, 0x001, 0x007, 0x00c, 0x011, 0x017, 0x01c, 0x021, 0x027, // Mode disp 5 * + 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0x018, 0x01c, 0x001, 0x005, 0x009, 0x00d, 0x011, 0x015, 0x019, 0x01d, // Mode disp 6 * + 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0x013, 0x017, 0x001, 0x004, 0x008, 0x00b, 0x00e, 0x011, 0x014, 0x018, // Mode disp 7 * + 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0x010, 0x012, 0x001, 0x003, 0x006, 0x009, 0x00b, 0x00e, 0x011, 0x013, // Mode disp 8 * + 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0x00e, 0x010, 0x001, 0x004, 0x006, 0x008, 0x00a, 0x00d, 0x00f, 0x011, // Mode disp 9 -- Unused + 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0x00c, 0x00e, 0x001, 0x003, 0x005, 0x007, 0x009, 0x00b, 0x00d, 0x00f, // Mode disp 10 + 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0x00a, 0x00c, 0x001, 0x003, 0x004, 0x006, 0x008, 0x00a, 0x00b, 0x00d, // Mode disp 11 + 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0x009, 0x00b, 0x001, 0x002, 0x004, 0x005, 0x007, 0x009, 0x00a, 0x00c, // Mode disp 12 + 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0x009, 0x00a, 0x001, 0x003, 0x004, 0x006, 0x007, 0x008, 0x00a, 0x00b, // Mode disp 13 + 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0x008, 0x009, 0x001, 0x002, 0x004, 0x005, 0x006, 0x007, 0x009, 0x00a, // Mode disp 14 + 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x001, 0x002, 0x003, 0x004, 0x006, 0x007, 0x008, 0x009, // Mode disp 15 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, // Mode disp 16 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, 0x001, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, // Mode disp 17 + 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x001, 0x002, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, // Mode disp 18 + 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x001, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, // Mode disp 19 + 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x001, 0x001, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, // Mode 
disp 20 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 21 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 22 + 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, // Mode disp 23 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 24 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x003, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x004, // Mode disp 25 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, // Mode disp 26 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, 0x003, // Mode disp 27 + 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, // Mode disp 28 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, // Mode disp 29 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, // Mode disp 30 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 31 +}; + + +// Shuffle vectors for w16 scale 2 vertical pdpc. +ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // Mode disp 0 -- Unused + 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x100, 0x120, 0x140, 0x160, 0x180, 0x1a0, 0x1c0, 0x1e0, // Mode disp 1 * + 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080, 0x090, 0x0a0, 0x0b0, 0x0c0, 0x0d0, 0x0e0, 0x0f0, // Mode disp 2 * + 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0x040, 0x04a, 0x055, 0x060, 0x06a, 0x075, 0x080, 0x08a, 0x095, 0x0a0, // Mode disp 3 * + 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0x030, 0x038, 0x040, 0x048, 0x050, 0x058, 0x060, 0x068, 0x070, 0x078, // Mode disp 4 * + 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0x020, 0x026, 0x02b, 0x030, 0x036, 0x03b, 0x040, 0x046, 0x04b, 0x050, // Mode disp 5 * + 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0x018, 0x01c, 0x020, 0x024, 0x028, 0x02c, 0x030, 0x034, 0x038, 0x03c, // Mode disp 6 * + 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0x013, 0x017, 0x01a, 0x01d, 0x020, 0x023, 0x027, 0x02a, 0x02d, 0x030, // Mode disp 7 * + 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0x010, 0x012, 0x015, 0x018, 0x01a, 0x01d, 0x020, 0x022, 0x025, 0x028, // Mode disp 8 * + 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0x00e, 0x010, 0x013, 0x015, 0x017, 0x019, 0x01c, 0x01e, 0x020, 0x023, // Mode disp 9 * + 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0x00c, 0x00e, 0x010, 0x012, 0x014, 0x016, 0x018, 0x01a, 0x01c, 0x01e, // Mode disp 10 * + 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0x00a, 0x00c, 0x00e, 0x010, 0x012, 0x013, 0x015, 0x017, 0x019, 0x01a, // Mode disp 11 * + 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0x009, 0x00b, 0x00c, 0x00e, 0x010, 0x011, 0x013, 0x014, 0x016, 0x018, // Mode disp 12 * + 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0x009, 0x00a, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x014, 0x015, // Mode disp 13 * + 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0x008, 0x009, 0x00a, 0x00b, 0x00d, 0x00e, 0x00f, 0x010, 0x011, 0x013, // Mode disp 14 * + 
0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, // Mode disp 15 -- Unused + 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, // Mode disp 16 + 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, // Mode disp 17 + 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, // Mode disp 18 + 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00a, // Mode disp 19 + 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x007, 0x008, 0x008, 0x009, // Mode disp 20 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, 0x008, // Mode disp 21 + 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, // Mode disp 22 + 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, // Mode disp 23 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, // Mode disp 24 + 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, // Mode disp 25 + 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 26 + 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, // Mode disp 27 + 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, // Mode disp 28 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, // Mode disp 29 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 30 + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, // Mode disp 31 +}; + #endif INTRA_AVX2_TABLES_H From ddf0037cf8cae2b41849e15b61341c71a3b54e57 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 12 Mar 2024 20:14:03 +0200 Subject: [PATCH 106/237] Add mip avx2 placeholders. 
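
The AVX2 entry registered here is still the scalar MIP algorithm (mip_predict_avx2
mirrors mip_predict_generic); the patch only moves the code behind the strategy
interface so a vectorized version can be dropped in later. As an illustrative example
of the parameters mip_predict derives: a 16x8 block yields size_id 2, reduced boundary
size 4, reduced prediction size 8 and upsampling factors 2 (horizontal) x 1 (vertical),
so only the horizontal upsampling pass runs.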
--- src/intra.c | 306 +------------------------ src/mip_data.h | 8 +- src/strategies/avx2/intra-avx2.c | 297 +++++++++++++++++++++++- src/strategies/generic/intra-generic.c | 295 ++++++++++++++++++++++++ src/strategies/strategies-intra.c | 1 + src/strategies/strategies-intra.h | 10 + 6 files changed, 606 insertions(+), 311 deletions(-) diff --git a/src/intra.c b/src/intra.c index 22eb93c7..3fa00a26 100644 --- a/src/intra.c +++ b/src/intra.c @@ -36,7 +36,6 @@ #include "image.h" #include "uvg_math.h" -#include "mip_data.h" #include "rdo.h" #include "search.h" #include "search_intra.h" @@ -86,17 +85,6 @@ static const uint8_t num_ref_pixels_left[16][16] = { { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 } }; - -static void mip_predict( - const encoder_state_t* const state, - const uvg_intra_references* const refs, - const uint16_t pred_block_width, - const uint16_t pred_block_height, - uvg_pixel* dst, - const int mip_mode, - const bool mip_transp); - - int8_t uvg_intra_get_dir_luma_predictor( const uint32_t x, const uint32_t y, @@ -645,298 +633,6 @@ uint8_t uvg_get_mip_flag_context( } -void uvg_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len) -{ - if (dst_len < src_len) - { - // Create reduced boundary by downsampling - uint16_t down_smp_factor = src_len / dst_len; - const int log2_factor = uvg_math_floor_log2(down_smp_factor); - const int rounding_offset = (1 << (log2_factor - 1)); - - uint16_t src_idx = 0; - for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++) - { - int sum = 0; - for (int k = 0; k < down_smp_factor; k++) - { - sum += ref_src[src_idx++]; - } - reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor; - } - } - else - { - // Copy boundary if no downsampling is needed - for (uint16_t i = 0; i < dst_len; ++i) - { - reduced_dst[i] = ref_src[i]; - } - } -} - - -void uvg_mip_reduced_pred(int* const output, - const int* const input, - const uint8_t* matrix, - const bool transpose, - const int red_bdry_size, - const int red_pred_size, - const int size_id, - const int in_offset, - const int in_offset_tr) -{ - const int input_size = 2 * red_bdry_size; - - // Use local buffer for transposed result - int out_buf_transposed[LCU_WIDTH * LCU_WIDTH]; - int* const out_ptr = transpose ? out_buf_transposed : output; - - int sum = 0; - for (int i = 0; i < input_size; i++) { - sum += input[i]; - } - const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; - assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four"); - - const uint8_t* weight = matrix; - const int input_offset = transpose ? in_offset_tr : in_offset; - - const bool red_size = (size_id == 2); - int pos_res = 0; - for (int y = 0; y < red_pred_size; y++) { - for (int x = 0; x < red_pred_size; x++) { - if (red_size) { - weight -= 1; - } - int tmp0 = red_size ? 
0 : (input[0] * weight[0]); - int tmp1 = input[1] * weight[1]; - int tmp2 = input[2] * weight[2]; - int tmp3 = input[3] * weight[3]; - for (int i = 4; i < input_size; i += 4) { - tmp0 += input[i] * weight[i]; - tmp1 += input[i + 1] * weight[i + 1]; - tmp2 += input[i + 2] * weight[i + 2]; - tmp3 += input[i + 3] * weight[i + 3]; - } - out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); - pos_res++; - weight += input_size; - } - } - - if (transpose) { - for (int y = 0; y < red_pred_size; y++) { - for (int x = 0; x < red_pred_size; x++) { - output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y]; - } - } - } -} - - -void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary, - const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim, - const uint16_t src_step, const uint16_t src_stride, - const uint16_t dst_step, const uint16_t dst_stride, - const uint16_t boundary_step, - const uint16_t ups_factor) -{ - const int log2_factor = uvg_math_floor_log2(ups_factor); - assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); - const int rounding_offset = 1 << (log2_factor - 1); - - uint16_t idx_orth_dim = 0; - const int* src_line = src; - int* dst_line = dst; - const int* boundary_line = boundary + boundary_step - 1; - while (idx_orth_dim < src_size_orth_dim) - { - uint16_t idx_upsample_dim = 0; - const int* before = boundary_line; - const int* behind = src_line; - int* cur_dst = dst_line; - while (idx_upsample_dim < src_size_ups_dim) - { - uint16_t pos = 1; - int scaled_before = (*before) << log2_factor; - int scaled_behind = 0; - while (pos <= ups_factor) - { - scaled_before -= *before; - scaled_behind += *behind; - *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; - - pos++; - cur_dst += dst_step; - } - - idx_upsample_dim++; - before = behind; - behind += src_step; - } - - idx_orth_dim++; - src_line += src_stride; - dst_line += dst_stride; - boundary_line += boundary_step; - } -} - - - -/** \brief Matrix weighted intra prediction. -*/ -static void mip_predict( - const encoder_state_t* const state, - const uvg_intra_references* const refs, - const uint16_t pred_block_width, - const uint16_t pred_block_height, - uvg_pixel* dst, - const int mip_mode, - const bool mip_transp) -{ - // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative - - uvg_pixel* out = dst; - int result[32*32] = {0}; - const int mode_idx = mip_mode; - - // *** INPUT PREP *** - - // Initialize prediction parameters START - uint16_t width = pred_block_width; - uint16_t height = pred_block_height; - - int size_id; // Prediction block type - if (width == 4 && height == 4) { - size_id = 0; - } - else if (width == 4 || height == 4 || (width == 8 && height == 8)) { - size_id = 1; - } - else { - size_id = 2; - } - - // Reduced boundary and prediction sizes - int red_bdry_size = (size_id == 0) ? 2 : 4; - int red_pred_size = (size_id < 2) ? 
4 : 8; - - // Upsampling factors - uint16_t ups_hor_factor = width / red_pred_size; - uint16_t ups_ver_factor = height / red_pred_size; - - // Upsampling factors must be powers of two - assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two."); - assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two."); - - // Initialize prediction parameters END - - int ref_samples_top[INTRA_REF_LENGTH]; - int ref_samples_left[INTRA_REF_LENGTH]; - - for (int i = 1; i < INTRA_REF_LENGTH; i++) { - ref_samples_top[i-1] = (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init - ref_samples_left[i-1] = (int)refs->ref.left[i]; - } - - // Compute reduced boundary with Haar-downsampling - const int input_size = 2 * red_bdry_size; - - int red_bdry[MIP_MAX_INPUT_SIZE]; - int red_bdry_trans[MIP_MAX_INPUT_SIZE]; - - int* const top_reduced = &red_bdry[0]; - int* const left_reduced = &red_bdry[red_bdry_size]; - - uvg_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size); - uvg_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size); - - // Transposed reduced boundaries - int* const left_reduced_trans = &red_bdry_trans[0]; - int* const top_reduced_trans = &red_bdry_trans[red_bdry_size]; - - for (int x = 0; x < red_bdry_size; x++) { - top_reduced_trans[x] = top_reduced[x]; - } - for (int y = 0; y < red_bdry_size; y++) { - left_reduced_trans[y] = left_reduced[y]; - } - - int input_offset = red_bdry[0]; - int input_offset_trans = red_bdry_trans[0]; - - const bool has_first_col = (size_id < 2); - // First column of matrix not needed for large blocks - red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0; - red_bdry_trans[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0; - - for (int i = 1; i < input_size; ++i) { - red_bdry[i] -= input_offset; - red_bdry_trans[i] -= input_offset_trans; - } - - // *** INPUT PREP *** END - - // *** BLOCK PREDICT *** - - const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1); - const bool transpose = mip_transp; - - const uint8_t* matrix; - switch (size_id) { - case 0: - matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; - break; - case 1: - matrix = &uvg_mip_matrix_8x8[mode_idx][0][0]; - break; - case 2: - matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; - break; - default: - assert(false && "Invalid MIP size id."); - } - - // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8 - int red_pred_buffer[8*8]; - int* const reduced_pred = need_upsampling ? red_pred_buffer : result; - - const int* const reduced_bdry = transpose ? 
red_bdry_trans : red_bdry; - - uvg_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); - if (need_upsampling) { - const int* ver_src = reduced_pred; - uint16_t ver_src_step = width; - - if (ups_hor_factor > 1) { - int* const hor_dst = result + (ups_ver_factor - 1) * width; - ver_src = hor_dst; - ver_src_step *= ups_ver_factor; - - uvg_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left, - red_pred_size, red_pred_size, - 1, red_pred_size, 1, ver_src_step, - ups_ver_factor, ups_hor_factor); - } - - if (ups_ver_factor > 1) { - uvg_mip_pred_upsampling_1D(result, ver_src, ref_samples_top, - red_pred_size, width, - ver_src_step, 1, width, 1, - 1, ups_ver_factor); - } - } - - // Assign and cast values from temp array to output - for (int i = 0; i < 32 * 32; i++) { - out[i] = (uvg_pixel)result[i]; - } - // *** BLOCK PREDICT *** END -} - - int8_t uvg_wide_angle_correction( int_fast8_t mode, const int log2_width, @@ -1617,7 +1313,7 @@ void uvg_intra_predict( if (intra_mode < 68) { if (use_mip) { assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]"); - mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); + uvg_mip_predict(refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); diff --git a/src/mip_data.h b/src/mip_data.h index 536db0a5..b3222bb7 100644 --- a/src/mip_data.h +++ b/src/mip_data.h @@ -43,9 +43,7 @@ #define MIP_SHIFT_MATRIX 6 #define MIP_OFFSET_MATRIX 32 -// NOTE: these matrices need to be aligned if used with avx2 -const uint8_t uvg_mip_matrix_4x4[16][16][4] = -{ +ALIGNED(32) static const uint8_t uvg_mip_matrix_4x4[16][16][4] = { { { 32, 30, 90, 28}, { 32, 32, 72, 28}, @@ -336,7 +334,7 @@ const uint8_t uvg_mip_matrix_4x4[16][16][4] = } }; -const uint8_t uvg_mip_matrix_8x8[8][16][8] = +ALIGNED(32) static const uint8_t uvg_mip_matrix_8x8[8][16][8] = { { { 30, 63, 46, 37, 25, 33, 33, 34}, @@ -484,7 +482,7 @@ const uint8_t uvg_mip_matrix_8x8[8][16][8] = } }; -const uint8_t uvg_mip_matrix_16x16[6][64][7] = +ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][7] = { { { 42, 37, 33, 27, 44, 33, 35}, diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4fadae6f..49f3f3ac 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -32,6 +32,7 @@ #include "strategies/avx2/intra-avx2.h" + #if COMPILE_INTEL_AVX2 && defined X86_64 #include "uvg266.h" #include "cu.h" @@ -46,6 +47,8 @@ #include "global.h" #include "intra-avx2.h" #include "intra_avx2_tables.h" +#include "mip_data.h" +#include "uvg_math.h" #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" @@ -4532,11 +4535,302 @@ static void uvg_pdpc_planar_dc_avx2( } } +void uvg_mip_boundary_downsampling_1D_avx2(int* reduced_dst, const int* const ref_src, int src_len, int dst_len) +{ + if (dst_len < src_len) + { + // Create reduced boundary by downsampling + uint16_t down_smp_factor = src_len / dst_len; + const int log2_factor = uvg_math_floor_log2(down_smp_factor); + const int rounding_offset = (1 << (log2_factor - 1)); + + uint16_t src_idx = 0; + for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++) + { + int sum = 0; + for (int k = 0; k < down_smp_factor; k++) + { + sum += ref_src[src_idx++]; + } + 
reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor; + } + } + else + { + // Copy boundary if no downsampling is needed + for (uint16_t i = 0; i < dst_len; ++i) + { + reduced_dst[i] = ref_src[i]; + } + } +} + + +void uvg_mip_reduced_pred_avx2(int* const output, + const int* const input, + const uint8_t* matrix, + const bool transpose, + const int red_bdry_size, + const int red_pred_size, + const int size_id, + const int in_offset, + const int in_offset_tr) +{ + const int input_size = 2 * red_bdry_size; + + // Use local buffer for transposed result + int out_buf_transposed[LCU_WIDTH * LCU_WIDTH]; + int* const out_ptr = transpose ? out_buf_transposed : output; + + int sum = 0; + for (int i = 0; i < input_size; i++) { + sum += input[i]; + } + const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; + assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four"); + + const uint8_t* weight = matrix; + const int input_offset = transpose ? in_offset_tr : in_offset; + + const bool red_size = (size_id == 2); + int pos_res = 0; + for (int y = 0; y < red_pred_size; y++) { + for (int x = 0; x < red_pred_size; x++) { + if (red_size) { + weight -= 1; + } + int tmp0 = red_size ? 0 : (input[0] * weight[0]); + int tmp1 = input[1] * weight[1]; + int tmp2 = input[2] * weight[2]; + int tmp3 = input[3] * weight[3]; + for (int i = 4; i < input_size; i += 4) { + tmp0 += input[i] * weight[i]; + tmp1 += input[i + 1] * weight[i + 1]; + tmp2 += input[i + 2] * weight[i + 2]; + tmp3 += input[i + 3] * weight[i + 3]; + } + out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); + pos_res++; + weight += input_size; + } + } + + if (transpose) { + for (int y = 0; y < red_pred_size; y++) { + for (int x = 0; x < red_pred_size; x++) { + output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y]; + } + } + } +} + + +void uvg_mip_pred_upsampling_1D_avx2(int* const dst, const int* const src, const int* const boundary, + const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim, + const uint16_t src_step, const uint16_t src_stride, + const uint16_t dst_step, const uint16_t dst_stride, + const uint16_t boundary_step, + const uint16_t ups_factor) +{ + const int log2_factor = uvg_math_floor_log2(ups_factor); + assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); + const int rounding_offset = 1 << (log2_factor - 1); + + uint16_t idx_orth_dim = 0; + const int* src_line = src; + int* dst_line = dst; + const int* boundary_line = boundary + boundary_step - 1; + while (idx_orth_dim < src_size_orth_dim) + { + uint16_t idx_upsample_dim = 0; + const int* before = boundary_line; + const int* behind = src_line; + int* cur_dst = dst_line; + while (idx_upsample_dim < src_size_ups_dim) + { + uint16_t pos = 1; + int scaled_before = (*before) << log2_factor; + int scaled_behind = 0; + while (pos <= ups_factor) + { + scaled_before -= *before; + scaled_behind += *behind; + *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; + + pos++; + cur_dst += dst_step; + } + + idx_upsample_dim++; + before = behind; + behind += src_step; + } + + idx_orth_dim++; + src_line += src_stride; + dst_line += dst_stride; + boundary_line += boundary_step; + } +} + + + +/** \brief Matrix weighted intra prediction. 
+*/ +static void mip_predict_avx2( + //const encoder_state_t* const state, + const uvg_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + uvg_pixel* dst, + const int mip_mode, + const bool mip_transp) +{ + // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative + + uvg_pixel* out = dst; + int result[32 * 32] = { 0 }; + const int mode_idx = mip_mode; + + // *** INPUT PREP *** + + // Initialize prediction parameters START + uint16_t width = pred_block_width; + uint16_t height = pred_block_height; + + int size_id; // Prediction block type + if (width == 4 && height == 4) { + size_id = 0; + } + else if (width == 4 || height == 4 || (width == 8 && height == 8)) { + size_id = 1; + } + else { + size_id = 2; + } + + // Reduced boundary and prediction sizes + int red_bdry_size = (size_id == 0) ? 2 : 4; + int red_pred_size = (size_id < 2) ? 4 : 8; + + // Upsampling factors + uint16_t ups_hor_factor = width / red_pred_size; + uint16_t ups_ver_factor = height / red_pred_size; + + // Upsampling factors must be powers of two + assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two."); + assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two."); + + // Initialize prediction parameters END + + int ref_samples_top[INTRA_REF_LENGTH]; + int ref_samples_left[INTRA_REF_LENGTH]; + + for (int i = 1; i < INTRA_REF_LENGTH; i++) { + ref_samples_top[i - 1] = (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init + ref_samples_left[i - 1] = (int)refs->ref.left[i]; + } + + // Compute reduced boundary with Haar-downsampling + const int input_size = 2 * red_bdry_size; + + int red_bdry[MIP_MAX_INPUT_SIZE]; + int red_bdry_trans[MIP_MAX_INPUT_SIZE]; + + int* const top_reduced = &red_bdry[0]; + int* const left_reduced = &red_bdry[red_bdry_size]; + + uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); + uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + + // Transposed reduced boundaries + int* const left_reduced_trans = &red_bdry_trans[0]; + int* const top_reduced_trans = &red_bdry_trans[red_bdry_size]; + + for (int x = 0; x < red_bdry_size; x++) { + top_reduced_trans[x] = top_reduced[x]; + } + for (int y = 0; y < red_bdry_size; y++) { + left_reduced_trans[y] = left_reduced[y]; + } + + int input_offset = red_bdry[0]; + int input_offset_trans = red_bdry_trans[0]; + + const bool has_first_col = (size_id < 2); + // First column of matrix not needed for large blocks + red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0; + red_bdry_trans[0] = has_first_col ? 
((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0; + + for (int i = 1; i < input_size; ++i) { + red_bdry[i] -= input_offset; + red_bdry_trans[i] -= input_offset_trans; + } + + // *** INPUT PREP *** END + + // *** BLOCK PREDICT *** + + const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1); + const bool transpose = mip_transp; + + const uint8_t* matrix; + switch (size_id) { + case 0: + matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; + break; + case 1: + matrix = &uvg_mip_matrix_8x8[mode_idx][0][0]; + break; + case 2: + matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; + break; + default: + assert(false && "Invalid MIP size id."); + } + + // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8 + int red_pred_buffer[8 * 8]; + int* const reduced_pred = need_upsampling ? red_pred_buffer : result; + + const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry; + + uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); + if (need_upsampling) { + const int* ver_src = reduced_pred; + uint16_t ver_src_step = width; + + if (ups_hor_factor > 1) { + int* const hor_dst = result + (ups_ver_factor - 1) * width; + ver_src = hor_dst; + ver_src_step *= ups_ver_factor; + + uvg_mip_pred_upsampling_1D_avx2(hor_dst, reduced_pred, ref_samples_left, + red_pred_size, red_pred_size, + 1, red_pred_size, 1, ver_src_step, + ups_ver_factor, ups_hor_factor); + } + + if (ups_ver_factor > 1) { + uvg_mip_pred_upsampling_1D_avx2(result, ver_src, ref_samples_top, + red_pred_size, width, + ver_src_step, 1, width, 1, + 1, ups_ver_factor); + } + } + + // Assign and cast values from temp array to output + for (int i = 0; i < 32 * 32; i++) { + out[i] = (uvg_pixel)result[i]; + } + // *** BLOCK PREDICT *** END +} + + #endif // UVG_BIT_DEPTH == 8 #endif // COMPILE_INTEL_AVX2 && defined X86_64 - int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) { bool success = true; @@ -4547,6 +4841,7 @@ int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); + success &= uvg_strategyselector_register(opaque, "mip_predict", "avx2", 40, &mip_predict_avx2); } #endif //UVG_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 398388fc..c00a6dfa 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -36,6 +36,7 @@ #include "cu.h" #include "intra.h" +#include "mip_data.h" #include "uvg266.h" #include "strategyselector.h" #include "uvg_math.h" @@ -458,6 +459,299 @@ static void uvg_pdpc_planar_dc_generic( } + +void uvg_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len) +{ + if (dst_len < src_len) + { + // Create reduced boundary by downsampling + uint16_t down_smp_factor = src_len / dst_len; + const int log2_factor = uvg_math_floor_log2(down_smp_factor); + const int rounding_offset = (1 << (log2_factor - 1)); + + uint16_t src_idx = 0; + for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++) + { + int sum = 0; + for (int k = 0; k < 
down_smp_factor; k++) + { + sum += ref_src[src_idx++]; + } + reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor; + } + } + else + { + // Copy boundary if no downsampling is needed + for (uint16_t i = 0; i < dst_len; ++i) + { + reduced_dst[i] = ref_src[i]; + } + } +} + + +void uvg_mip_reduced_pred(int* const output, + const int* const input, + const uint8_t* matrix, + const bool transpose, + const int red_bdry_size, + const int red_pred_size, + const int size_id, + const int in_offset, + const int in_offset_tr) +{ + const int input_size = 2 * red_bdry_size; + + // Use local buffer for transposed result + int out_buf_transposed[LCU_WIDTH * LCU_WIDTH]; + int* const out_ptr = transpose ? out_buf_transposed : output; + + int sum = 0; + for (int i = 0; i < input_size; i++) { + sum += input[i]; + } + const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; + assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four"); + + const uint8_t* weight = matrix; + const int input_offset = transpose ? in_offset_tr : in_offset; + + const bool red_size = (size_id == 2); + int pos_res = 0; + for (int y = 0; y < red_pred_size; y++) { + for (int x = 0; x < red_pred_size; x++) { + if (red_size) { + weight -= 1; + } + int tmp0 = red_size ? 0 : (input[0] * weight[0]); + int tmp1 = input[1] * weight[1]; + int tmp2 = input[2] * weight[2]; + int tmp3 = input[3] * weight[3]; + for (int i = 4; i < input_size; i += 4) { + tmp0 += input[i] * weight[i]; + tmp1 += input[i + 1] * weight[i + 1]; + tmp2 += input[i + 2] * weight[i + 2]; + tmp3 += input[i + 3] * weight[i + 3]; + } + out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); + pos_res++; + weight += input_size; + } + } + + if (transpose) { + for (int y = 0; y < red_pred_size; y++) { + for (int x = 0; x < red_pred_size; x++) { + output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y]; + } + } + } +} + + +void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary, + const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim, + const uint16_t src_step, const uint16_t src_stride, + const uint16_t dst_step, const uint16_t dst_stride, + const uint16_t boundary_step, + const uint16_t ups_factor) +{ + const int log2_factor = uvg_math_floor_log2(ups_factor); + assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); + const int rounding_offset = 1 << (log2_factor - 1); + + uint16_t idx_orth_dim = 0; + const int* src_line = src; + int* dst_line = dst; + const int* boundary_line = boundary + boundary_step - 1; + while (idx_orth_dim < src_size_orth_dim) + { + uint16_t idx_upsample_dim = 0; + const int* before = boundary_line; + const int* behind = src_line; + int* cur_dst = dst_line; + while (idx_upsample_dim < src_size_ups_dim) + { + uint16_t pos = 1; + int scaled_before = (*before) << log2_factor; + int scaled_behind = 0; + while (pos <= ups_factor) + { + scaled_before -= *before; + scaled_behind += *behind; + *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; + + pos++; + cur_dst += dst_step; + } + + idx_upsample_dim++; + before = behind; + behind += src_step; + } + + idx_orth_dim++; + src_line += src_stride; + dst_line += dst_stride; + boundary_line += boundary_step; + } +} + + + +/** \brief Matrix weighted intra prediction. 
+*/ +static void mip_predict_generic( + const uvg_intra_references* const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + uvg_pixel* dst, + const int mip_mode, + const bool mip_transp) +{ + // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative + + uvg_pixel* out = dst; + int result[32*32] = {0}; + const int mode_idx = mip_mode; + + // *** INPUT PREP *** + + // Initialize prediction parameters START + uint16_t width = pred_block_width; + uint16_t height = pred_block_height; + + int size_id; // Prediction block type + if (width == 4 && height == 4) { + size_id = 0; + } + else if (width == 4 || height == 4 || (width == 8 && height == 8)) { + size_id = 1; + } + else { + size_id = 2; + } + + // Reduced boundary and prediction sizes + int red_bdry_size = (size_id == 0) ? 2 : 4; + int red_pred_size = (size_id < 2) ? 4 : 8; + + // Upsampling factors + uint16_t ups_hor_factor = width / red_pred_size; + uint16_t ups_ver_factor = height / red_pred_size; + + // Upsampling factors must be powers of two + assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two."); + assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two."); + + // Initialize prediction parameters END + + int ref_samples_top[INTRA_REF_LENGTH]; + int ref_samples_left[INTRA_REF_LENGTH]; + + for (int i = 1; i < INTRA_REF_LENGTH; i++) { + ref_samples_top[i-1] = (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init + ref_samples_left[i-1] = (int)refs->ref.left[i]; + } + + // Compute reduced boundary with Haar-downsampling + const int input_size = 2 * red_bdry_size; + + int red_bdry[MIP_MAX_INPUT_SIZE]; + int red_bdry_trans[MIP_MAX_INPUT_SIZE]; + + int* const top_reduced = &red_bdry[0]; + int* const left_reduced = &red_bdry[red_bdry_size]; + + uvg_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size); + uvg_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size); + + // Transposed reduced boundaries + int* const left_reduced_trans = &red_bdry_trans[0]; + int* const top_reduced_trans = &red_bdry_trans[red_bdry_size]; + + for (int x = 0; x < red_bdry_size; x++) { + top_reduced_trans[x] = top_reduced[x]; + } + for (int y = 0; y < red_bdry_size; y++) { + left_reduced_trans[y] = left_reduced[y]; + } + + int input_offset = red_bdry[0]; + int input_offset_trans = red_bdry_trans[0]; + + const bool has_first_col = (size_id < 2); + // First column of matrix not needed for large blocks + red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0; + red_bdry_trans[0] = has_first_col ? 
((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0; + + for (int i = 1; i < input_size; ++i) { + red_bdry[i] -= input_offset; + red_bdry_trans[i] -= input_offset_trans; + } + + // *** INPUT PREP *** END + + // *** BLOCK PREDICT *** + + const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1); + const bool transpose = mip_transp; + + const uint8_t* matrix; + switch (size_id) { + case 0: + matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; + break; + case 1: + matrix = &uvg_mip_matrix_8x8[mode_idx][0][0]; + break; + case 2: + matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; + break; + default: + assert(false && "Invalid MIP size id."); + } + + // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8 + int red_pred_buffer[8*8]; + int* const reduced_pred = need_upsampling ? red_pred_buffer : result; + + const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry; + + uvg_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); + if (need_upsampling) { + const int* ver_src = reduced_pred; + uint16_t ver_src_step = width; + + if (ups_hor_factor > 1) { + int* const hor_dst = result + (ups_ver_factor - 1) * width; + ver_src = hor_dst; + ver_src_step *= ups_ver_factor; + + uvg_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left, + red_pred_size, red_pred_size, + 1, red_pred_size, 1, ver_src_step, + ups_ver_factor, ups_hor_factor); + } + + if (ups_ver_factor > 1) { + uvg_mip_pred_upsampling_1D(result, ver_src, ref_samples_top, + red_pred_size, width, + ver_src_step, 1, width, 1, + 1, ups_ver_factor); + } + } + + // Assign and cast values from temp array to output + for (int i = 0; i < 32 * 32; i++) { + out[i] = (uvg_pixel)result[i]; + } + // *** BLOCK PREDICT *** END +} + + + int uvg_strategy_register_intra_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -466,6 +760,7 @@ int uvg_strategy_register_intra_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "generic", 0, &uvg_intra_pred_planar_generic); success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "generic", 0, &uvg_intra_pred_filtered_dc_generic); success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "generic", 0, &uvg_pdpc_planar_dc_generic); + success &= uvg_strategyselector_register(opaque, "mip_predict", "generic", 0, &mip_predict_generic); return success; } diff --git a/src/strategies/strategies-intra.c b/src/strategies/strategies-intra.c index d12b37f7..e1f82d92 100644 --- a/src/strategies/strategies-intra.c +++ b/src/strategies/strategies-intra.c @@ -42,6 +42,7 @@ angular_pred_func *uvg_angular_pred; intra_pred_planar_func *uvg_intra_pred_planar; intra_pred_filtered_dc_func *uvg_intra_pred_filtered_dc; pdpc_planar_dc_func *uvg_pdpc_planar_dc; +mip_pred_func *uvg_mip_predict; int uvg_strategy_register_intra(void* opaque, uint8_t bitdepth) { bool success = true; diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 52f5e519..eeec4b09 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -76,11 +76,20 @@ typedef void (pdpc_planar_dc_func)( const uvg_intra_ref *const used_ref, uvg_pixel *const dst); +typedef void(mip_pred_func)( + const uvg_intra_references * const refs, + const uint16_t pred_block_width, + const uint16_t pred_block_height, + uvg_pixel *dst, + const int mip_mode, + const bool mip_transp); + // 
Declare function pointers.
 extern angular_pred_func * uvg_angular_pred;
 extern intra_pred_planar_func * uvg_intra_pred_planar;
 extern intra_pred_filtered_dc_func * uvg_intra_pred_filtered_dc;
 extern pdpc_planar_dc_func * uvg_pdpc_planar_dc;
+extern mip_pred_func *uvg_mip_predict;
 
 int uvg_strategy_register_intra(void* opaque, uint8_t bitdepth);
 
@@ -90,6 +99,7 @@ int uvg_strategy_register_intra(void* opaque, uint8_t bitdepth);
   {"intra_pred_planar", (void**) &uvg_intra_pred_planar}, \
   {"intra_pred_filtered_dc", (void**) &uvg_intra_pred_filtered_dc}, \
   {"pdpc_planar_dc", (void**) &uvg_pdpc_planar_dc}, \
+  {"mip_predict", (void**) &uvg_mip_predict},

From 357d14010c12399729c01b219b53d46e4fa8b1a1 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Wed, 13 Mar 2024 16:43:28 +0200
Subject: [PATCH 107/237] Add some comments on optimizing the downsampling
 function, or whether it is necessary at all.

---
 src/strategies/avx2/intra-avx2.c | 34 ++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index 49f3f3ac..d8781cdb 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -4537,31 +4537,42 @@ static void uvg_pdpc_planar_dc_avx2(
 
 void uvg_mip_boundary_downsampling_1D_avx2(int* reduced_dst, const int* const ref_src, int src_len, int dst_len)
 {
+  // Source length can be 4, 8, 16, 32 or 64
+  // Destination length can be 2 or 4
+
+  // Due to the small size of dst_len, not much can be done with AVX2 here
+
   if (dst_len < src_len)
   {
     // Create reduced boundary by downsampling
+    // Maximum down sample factor is 64 / 2 = 32
    uint16_t down_smp_factor = src_len / dst_len;
     const int log2_factor = uvg_math_floor_log2(down_smp_factor);
     const int rounding_offset = (1 << (log2_factor - 1));
 
     uint16_t src_idx = 0;
-    for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++)
-    {
+    // This loop is run at most 4 times
+    for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++) {
       int sum = 0;
-      for (int k = 0; k < down_smp_factor; k++)
-      {
+      // Sum together up to 32 sequential source samples
+      for (int k = 0; k < down_smp_factor; k++) {
         sum += ref_src[src_idx++];
       }
       reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor;
+      // I can only see limited optimization potential here. There are a lot of additions, but not too much data.
+      // For down sample factor 2, a simple horizontal add would do wonders, but it can only handle that specific case.
+      // There would need to be several versions of this function for different cases; not entirely sure if it's worth it.
     }
   }
   else
   {
-    // Copy boundary if no downsampling is needed
-    for (uint16_t i = 0; i < dst_len; ++i)
+    // Copy boundary if no downsampling is needed. 
If this branch is reached, dst_len must be 4 + memcpy(reduced_dst, ref_src, 4 * sizeof(int)); // Copy as much as dst_len indicates + + /*for (uint16_t i = 0; i < dst_len; ++i) { reduced_dst[i] = ref_src[i]; - } + }*/ } } @@ -4576,7 +4587,8 @@ void uvg_mip_reduced_pred_avx2(int* const output, const int in_offset, const int in_offset_tr) { - const int input_size = 2 * red_bdry_size; + // Reduced boundary size is 2 or 4 -> input size is 4 or 8 + const int input_size = 2 * red_bdry_size; // Use local buffer for transposed result int out_buf_transposed[LCU_WIDTH * LCU_WIDTH]; @@ -4594,6 +4606,8 @@ void uvg_mip_reduced_pred_avx2(int* const output, const bool red_size = (size_id == 2); int pos_res = 0; + + // Reduced prediction size is 4 or 8 for (int y = 0; y < red_pred_size; y++) { for (int x = 0; x < red_pred_size; x++) { if (red_size) { @@ -4677,7 +4691,7 @@ void uvg_mip_pred_upsampling_1D_avx2(int* const dst, const int* const src, const /** \brief Matrix weighted intra prediction. */ -static void mip_predict_avx2( +void mip_predict_avx2( //const encoder_state_t* const state, const uvg_intra_references* const refs, const uint16_t pred_block_width, @@ -4774,7 +4788,7 @@ static void mip_predict_avx2( const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1); const bool transpose = mip_transp; - const uint8_t* matrix; + const uint8_t* matrix = 0; switch (size_id) { case 0: matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; From 5ebadeea854c5fb46ffe42a2d7f13f763056e787 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 14 Mar 2024 21:31:34 +0200 Subject: [PATCH 108/237] Change most mip code to work with uvg_pixel. Retain 16-bit parts where needed. --- src/strategies/avx2/intra-avx2.c | 88 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d8781cdb..d181c959 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4535,7 +4535,7 @@ static void uvg_pdpc_planar_dc_avx2( } } -void uvg_mip_boundary_downsampling_1D_avx2(int* reduced_dst, const int* const ref_src, int src_len, int dst_len) +void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src, int src_len, int dst_len) { // Source length can be 4, 8, 16, 32 or 64 // Destination length can be 2 or 4 @@ -4577,8 +4577,8 @@ void uvg_mip_boundary_downsampling_1D_avx2(int* reduced_dst, const int* const re } -void uvg_mip_reduced_pred_avx2(int* const output, - const int* const input, +void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, + const int16_t* const input, const uint8_t* matrix, const bool transpose, const int red_bdry_size, @@ -4591,8 +4591,8 @@ void uvg_mip_reduced_pred_avx2(int* const output, const int input_size = 2 * red_bdry_size; // Use local buffer for transposed result - int out_buf_transposed[LCU_WIDTH * LCU_WIDTH]; - int* const out_ptr = transpose ? out_buf_transposed : output; + uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH + uvg_pixel* const out_ptr = transpose ? out_buf_transposed : output; int sum = 0; for (int i = 0; i < input_size; i++) { @@ -4613,6 +4613,7 @@ void uvg_mip_reduced_pred_avx2(int* const output, if (red_size) { weight -= 1; } + // Use 16-bit intermediates int tmp0 = red_size ? 
0 : (input[0] * weight[0]); int tmp1 = input[1] * weight[1]; int tmp2 = input[2] * weight[2]; @@ -4639,27 +4640,27 @@ void uvg_mip_reduced_pred_avx2(int* const output, } -void uvg_mip_pred_upsampling_1D_avx2(int* const dst, const int* const src, const int* const boundary, - const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim, - const uint16_t src_step, const uint16_t src_stride, - const uint16_t dst_step, const uint16_t dst_stride, - const uint16_t boundary_step, - const uint16_t ups_factor) +void uvg_mip_pred_upsampling_1D_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, + const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, + const uint8_t src_step, const uint8_t src_stride, + const uint8_t dst_step, const uint8_t dst_stride, + const uint8_t boundary_step, + const uint8_t ups_factor) { const int log2_factor = uvg_math_floor_log2(ups_factor); assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); const int rounding_offset = 1 << (log2_factor - 1); uint16_t idx_orth_dim = 0; - const int* src_line = src; - int* dst_line = dst; - const int* boundary_line = boundary + boundary_step - 1; + const uvg_pixel* src_line = src; + uvg_pixel* dst_line = dst; + const uvg_pixel* boundary_line = boundary + boundary_step - 1; while (idx_orth_dim < src_size_orth_dim) { uint16_t idx_upsample_dim = 0; - const int* before = boundary_line; - const int* behind = src_line; - int* cur_dst = dst_line; + const uvg_pixel* before = boundary_line; + const uvg_pixel* behind = src_line; + uvg_pixel* cur_dst = dst_line; while (idx_upsample_dim < src_size_ups_dim) { uint16_t pos = 1; @@ -4703,7 +4704,7 @@ void mip_predict_avx2( // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative uvg_pixel* out = dst; - int result[32 * 32] = { 0 }; + uvg_pixel result[32 * 32] = { 0 }; const int mode_idx = mip_mode; // *** INPUT PREP *** @@ -4737,29 +4738,27 @@ void mip_predict_avx2( // Initialize prediction parameters END - int ref_samples_top[INTRA_REF_LENGTH]; - int ref_samples_left[INTRA_REF_LENGTH]; - - for (int i = 1; i < INTRA_REF_LENGTH; i++) { - ref_samples_top[i - 1] = (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init - ref_samples_left[i - 1] = (int)refs->ref.left[i]; - } + const uvg_pixel* ref_samples_top = &refs->ref.top[1]; + const uvg_pixel* ref_samples_left = &refs->ref.left[1]; // Compute reduced boundary with Haar-downsampling const int input_size = 2 * red_bdry_size; - int red_bdry[MIP_MAX_INPUT_SIZE]; - int red_bdry_trans[MIP_MAX_INPUT_SIZE]; + uvg_pixel red_bdry[MIP_MAX_INPUT_SIZE]; + uvg_pixel red_bdry_trans[MIP_MAX_INPUT_SIZE]; + int16_t red_bdry16[MIP_MAX_INPUT_SIZE]; + int16_t red_bdry_trans16[MIP_MAX_INPUT_SIZE]; - int* const top_reduced = &red_bdry[0]; - int* const left_reduced = &red_bdry[red_bdry_size]; + uvg_pixel* const top_reduced = &red_bdry[0]; + uvg_pixel* const left_reduced = &red_bdry[red_bdry_size]; + // These work fine with uvg_pixel uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); // Transposed reduced boundaries - int* const left_reduced_trans = &red_bdry_trans[0]; - int* const top_reduced_trans = &red_bdry_trans[red_bdry_size]; + uvg_pixel* const left_reduced_trans = &red_bdry_trans[0]; + uvg_pixel* const top_reduced_trans = &red_bdry_trans[red_bdry_size]; for (int x = 0; x < red_bdry_size; x++) { 
top_reduced_trans[x] = top_reduced[x]; @@ -4768,17 +4767,19 @@ void mip_predict_avx2( left_reduced_trans[y] = left_reduced[y]; } - int input_offset = red_bdry[0]; - int input_offset_trans = red_bdry_trans[0]; + uvg_pixel input_offset = red_bdry[0]; + uvg_pixel input_offset_trans = red_bdry_trans[0]; const bool has_first_col = (size_id < 2); // First column of matrix not needed for large blocks - red_bdry[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0; - red_bdry_trans[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0; + // These can potentially fail with uvg_pixel + red_bdry16[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset) : 0; + red_bdry_trans16[0] = has_first_col ? ((1 << (UVG_BIT_DEPTH - 1)) - input_offset_trans) : 0; + // This fails with uvg_pixel, here at least int16_t is needed for (int i = 1; i < input_size; ++i) { - red_bdry[i] -= input_offset; - red_bdry_trans[i] -= input_offset_trans; + red_bdry16[i] = red_bdry[i] - input_offset; + red_bdry_trans16[i] = red_bdry_trans[i] - input_offset_trans; } // *** INPUT PREP *** END @@ -4804,18 +4805,19 @@ void mip_predict_avx2( } // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8 - int red_pred_buffer[8 * 8]; - int* const reduced_pred = need_upsampling ? red_pred_buffer : result; + uvg_pixel red_pred_buffer[8 * 8]; + uvg_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result; - const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry; + const uvg_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry; + const int16_t* const reduced_bdry16 = transpose ? red_bdry_trans16 : red_bdry16; - uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); + uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); if (need_upsampling) { - const int* ver_src = reduced_pred; + const uvg_pixel* ver_src = reduced_pred; uint16_t ver_src_step = width; if (ups_hor_factor > 1) { - int* const hor_dst = result + (ups_ver_factor - 1) * width; + uvg_pixel* const hor_dst = result + (ups_ver_factor - 1) * width; ver_src = hor_dst; ver_src_step *= ups_ver_factor; From cc80fc4e1001556cf6f2f877caf51732ac319e6b Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 18 Mar 2024 22:42:12 +0200 Subject: [PATCH 109/237] Add zero coeffs to size id 2 weight matrices. Do a separate mip predict for each size id. WIP on size id 2 function. 
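
A minimal sketch of why the added zero coefficient column cannot change any
result (illustrative only, not part of this patch; the helper names dot_old,
dot_new and the sample boundary values are made up, the weight row is the
first row of the old uvg_mip_matrix_16x16 mode 0): padding each 7-entry
weight row to 8 entries with a leading zero gives the same dot product while
letting the size id 2 kernel run a uniform 8-wide inner loop without the old
"weight -= 1" / "red_size ? 0 : ..." special case removed below.

    /* Standalone check: old 7-weight row vs. new zero-padded 8-weight row. */
    #include <assert.h>
    #include <stdint.h>

    static int dot_old(const int16_t *in, const uint8_t *w7) {
      int acc = 0;
      for (int i = 1; i < 8; ++i) acc += in[i] * w7[i - 1]; /* input[0] skipped */
      return acc;
    }

    static int dot_new(const int16_t *in, const uint8_t *w8) {
      int acc = 0;
      for (int i = 0; i < 8; ++i) acc += in[i] * w8[i];     /* w8[0] == 0 */
      return acc;
    }

    int main(void) {
      const int16_t in[8] = { 13, 5, -3, 7, 2, -1, 4, 6 };  /* made-up reduced boundary */
      const uint8_t w7[7] = { 42, 37, 33, 27, 44, 33, 35 };
      const uint8_t w8[8] = { 0, 42, 37, 33, 27, 44, 33, 35 };
      assert(dot_old(in, w7) == dot_new(in, w8));           /* identical for any in[0] */
      return 0;
    }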
--- src/mip_data.h | 772 ++++++++++++------------- src/strategies/avx2/intra-avx2.c | 71 ++- src/strategies/generic/intra-generic.c | 8 +- 3 files changed, 453 insertions(+), 398 deletions(-) diff --git a/src/mip_data.h b/src/mip_data.h index b3222bb7..a6549c34 100644 --- a/src/mip_data.h +++ b/src/mip_data.h @@ -482,402 +482,402 @@ ALIGNED(32) static const uint8_t uvg_mip_matrix_8x8[8][16][8] = } }; -ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][7] = +ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][8] = { { - { 42, 37, 33, 27, 44, 33, 35}, - { 71, 39, 34, 24, 36, 35, 36}, - { 77, 46, 35, 33, 30, 34, 36}, - { 64, 60, 35, 33, 31, 32, 36}, - { 49, 71, 38, 32, 32, 31, 36}, - { 42, 66, 50, 33, 31, 32, 36}, - { 40, 52, 67, 33, 31, 32, 35}, - { 38, 43, 75, 33, 32, 32, 35}, - { 56, 40, 33, 26, 43, 38, 36}, - { 70, 49, 34, 30, 28, 38, 38}, - { 65, 57, 36, 34, 28, 33, 39}, - { 59, 60, 39, 33, 30, 31, 38}, - { 55, 60, 43, 33, 30, 31, 38}, - { 51, 61, 47, 33, 30, 32, 37}, - { 46, 62, 51, 34, 30, 32, 37}, - { 42, 60, 55, 33, 31, 32, 37}, - { 60, 42, 34, 30, 37, 43, 38}, - { 68, 52, 35, 35, 22, 37, 40}, - { 62, 58, 37, 34, 28, 31, 40}, - { 58, 59, 41, 33, 30, 30, 39}, - { 56, 59, 44, 34, 30, 31, 38}, - { 53, 60, 45, 33, 30, 31, 38}, - { 49, 65, 45, 33, 30, 31, 38}, - { 45, 64, 47, 33, 31, 32, 38}, - { 59, 44, 35, 31, 34, 43, 41}, - { 66, 53, 36, 35, 25, 31, 43}, - { 61, 58, 38, 34, 29, 30, 40}, - { 59, 57, 41, 33, 30, 31, 39}, - { 57, 58, 43, 33, 30, 31, 39}, - { 54, 61, 43, 33, 31, 31, 39}, - { 51, 64, 43, 33, 31, 31, 39}, - { 48, 64, 45, 33, 32, 31, 39}, - { 57, 45, 35, 30, 35, 40, 44}, - { 65, 54, 37, 33, 33, 24, 44}, - { 63, 56, 38, 34, 30, 29, 39}, - { 61, 56, 41, 34, 30, 32, 39}, - { 58, 58, 42, 33, 31, 31, 39}, - { 54, 62, 41, 33, 31, 31, 39}, - { 51, 65, 42, 33, 31, 31, 39}, - { 48, 63, 43, 33, 32, 31, 39}, - { 55, 46, 35, 30, 36, 38, 47}, - { 65, 53, 37, 32, 36, 26, 40}, - { 65, 54, 38, 33, 31, 30, 38}, - { 63, 55, 39, 33, 30, 32, 38}, - { 59, 58, 40, 33, 31, 31, 39}, - { 54, 64, 40, 33, 31, 30, 40}, - { 49, 66, 40, 32, 32, 30, 41}, - { 48, 64, 42, 32, 32, 30, 41}, - { 54, 46, 35, 30, 34, 39, 49}, - { 64, 52, 36, 32, 34, 34, 35}, - { 65, 53, 37, 33, 32, 32, 37}, - { 63, 55, 38, 33, 31, 31, 39}, - { 59, 60, 38, 33, 31, 31, 40}, - { 54, 64, 38, 33, 32, 30, 40}, - { 49, 66, 39, 33, 32, 29, 41}, - { 47, 64, 42, 32, 33, 29, 42}, - { 51, 46, 35, 31, 33, 37, 54}, - { 61, 51, 36, 32, 33, 38, 36}, - { 63, 53, 37, 32, 32, 34, 37}, - { 62, 55, 37, 33, 32, 32, 39}, - { 58, 59, 37, 33, 32, 31, 40}, - { 53, 63, 38, 33, 32, 31, 40}, - { 49, 64, 40, 33, 33, 30, 41}, - { 46, 62, 42, 33, 33, 30, 42} - }, + { 0, 42, 37, 33, 27, 44, 33, 35}, + { 0, 71, 39, 34, 24, 36, 35, 36}, + { 0, 77, 46, 35, 33, 30, 34, 36}, + { 0, 64, 60, 35, 33, 31, 32, 36}, + { 0, 49, 71, 38, 32, 32, 31, 36}, + { 0, 42, 66, 50, 33, 31, 32, 36}, + { 0, 40, 52, 67, 33, 31, 32, 35}, + { 0, 38, 43, 75, 33, 32, 32, 35}, + { 0, 56, 40, 33, 26, 43, 38, 36}, + { 0, 70, 49, 34, 30, 28, 38, 38}, + { 0, 65, 57, 36, 34, 28, 33, 39}, + { 0, 59, 60, 39, 33, 30, 31, 38}, + { 0, 55, 60, 43, 33, 30, 31, 38}, + { 0, 51, 61, 47, 33, 30, 32, 37}, + { 0, 46, 62, 51, 34, 30, 32, 37}, + { 0, 42, 60, 55, 33, 31, 32, 37}, + { 0, 60, 42, 34, 30, 37, 43, 38}, + { 0, 68, 52, 35, 35, 22, 37, 40}, + { 0, 62, 58, 37, 34, 28, 31, 40}, + { 0, 58, 59, 41, 33, 30, 30, 39}, + { 0, 56, 59, 44, 34, 30, 31, 38}, + { 0, 53, 60, 45, 33, 30, 31, 38}, + { 0, 49, 65, 45, 33, 30, 31, 38}, + { 0, 45, 64, 47, 33, 31, 32, 38}, + { 0, 59, 44, 35, 31, 34, 43, 
41}, + { 0, 66, 53, 36, 35, 25, 31, 43}, + { 0, 61, 58, 38, 34, 29, 30, 40}, + { 0, 59, 57, 41, 33, 30, 31, 39}, + { 0, 57, 58, 43, 33, 30, 31, 39}, + { 0, 54, 61, 43, 33, 31, 31, 39}, + { 0, 51, 64, 43, 33, 31, 31, 39}, + { 0, 48, 64, 45, 33, 32, 31, 39}, + { 0, 57, 45, 35, 30, 35, 40, 44}, + { 0, 65, 54, 37, 33, 33, 24, 44}, + { 0, 63, 56, 38, 34, 30, 29, 39}, + { 0, 61, 56, 41, 34, 30, 32, 39}, + { 0, 58, 58, 42, 33, 31, 31, 39}, + { 0, 54, 62, 41, 33, 31, 31, 39}, + { 0, 51, 65, 42, 33, 31, 31, 39}, + { 0, 48, 63, 43, 33, 32, 31, 39}, + { 0, 55, 46, 35, 30, 36, 38, 47}, + { 0, 65, 53, 37, 32, 36, 26, 40}, + { 0, 65, 54, 38, 33, 31, 30, 38}, + { 0, 63, 55, 39, 33, 30, 32, 38}, + { 0, 59, 58, 40, 33, 31, 31, 39}, + { 0, 54, 64, 40, 33, 31, 30, 40}, + { 0, 49, 66, 40, 32, 32, 30, 41}, + { 0, 48, 64, 42, 32, 32, 30, 41}, + { 0, 54, 46, 35, 30, 34, 39, 49}, + { 0, 64, 52, 36, 32, 34, 34, 35}, + { 0, 65, 53, 37, 33, 32, 32, 37}, + { 0, 63, 55, 38, 33, 31, 31, 39}, + { 0, 59, 60, 38, 33, 31, 31, 40}, + { 0, 54, 64, 38, 33, 32, 30, 40}, + { 0, 49, 66, 39, 33, 32, 29, 41}, + { 0, 47, 64, 42, 32, 33, 29, 42}, + { 0, 51, 46, 35, 31, 33, 37, 54}, + { 0, 61, 51, 36, 32, 33, 38, 36}, + { 0, 63, 53, 37, 32, 32, 34, 37}, + { 0, 62, 55, 37, 33, 32, 32, 39}, + { 0, 58, 59, 37, 33, 32, 31, 40}, + { 0, 53, 63, 38, 33, 32, 31, 40}, + { 0, 49, 64, 40, 33, 33, 30, 41}, + { 0, 46, 62, 42, 33, 33, 30, 42} + }, { - { 39, 34, 33, 58, 44, 31, 32}, - { 60, 38, 32, 40, 51, 30, 31}, - { 73, 49, 31, 39, 48, 32, 31}, - { 60, 73, 30, 39, 46, 33, 32}, - { 43, 87, 35, 38, 45, 33, 32}, - { 35, 78, 54, 36, 45, 33, 32}, - { 33, 47, 86, 35, 44, 33, 32}, - { 31, 17, 114, 34, 44, 34, 33}, - { 43, 37, 32, 53, 70, 30, 31}, - { 53, 50, 30, 42, 72, 31, 30}, - { 52, 66, 30, 39, 70, 32, 30}, - { 46, 78, 35, 37, 68, 34, 30}, - { 43, 75, 48, 37, 66, 34, 30}, - { 40, 62, 68, 35, 65, 35, 30}, - { 33, 37, 97, 33, 62, 37, 31}, - { 26, 14, 122, 32, 59, 38, 33}, - { 40, 39, 33, 34, 87, 37, 30}, - { 45, 54, 32, 34, 84, 41, 29}, - { 41, 70, 35, 33, 83, 40, 29}, - { 37, 73, 44, 32, 82, 40, 30}, - { 37, 65, 60, 31, 81, 41, 29}, - { 35, 48, 82, 30, 79, 43, 29}, - { 28, 27, 108, 28, 76, 45, 30}, - { 19, 11, 127, 27, 70, 46, 32}, - { 38, 40, 34, 27, 73, 62, 28}, - { 39, 54, 35, 30, 73, 62, 28}, - { 33, 65, 41, 29, 75, 59, 28}, - { 30, 65, 53, 27, 76, 58, 29}, - { 29, 53, 72, 26, 77, 58, 29}, - { 27, 35, 95, 24, 77, 60, 28}, - { 19, 19, 117, 23, 74, 61, 30}, - { 9, 16, 127, 23, 68, 60, 34}, - { 35, 40, 35, 29, 44, 89, 30}, - { 33, 51, 39, 29, 49, 86, 30}, - { 28, 57, 49, 28, 53, 83, 30}, - { 24, 52, 65, 26, 56, 82, 30}, - { 22, 39, 86, 24, 58, 82, 30}, - { 18, 22, 108, 23, 59, 82, 31}, - { 10, 13, 125, 22, 58, 80, 33}, - { 0, 19, 127, 22, 56, 74, 40}, - { 33, 40, 36, 31, 28, 90, 45}, - { 29, 46, 44, 29, 31, 92, 43}, - { 24, 45, 58, 28, 34, 91, 43}, - { 19, 37, 78, 26, 37, 91, 43}, - { 15, 22, 99, 25, 38, 91, 42}, - { 11, 11, 118, 24, 39, 90, 44}, - { 2, 11, 127, 23, 41, 85, 48}, - { 0, 17, 127, 23, 43, 75, 55}, - { 31, 37, 39, 30, 28, 54, 82}, - { 27, 37, 52, 28, 30, 58, 79}, - { 22, 30, 70, 27, 32, 58, 79}, - { 15, 19, 91, 26, 33, 58, 79}, - { 10, 8, 111, 25, 34, 58, 79}, - { 5, 2, 125, 25, 35, 57, 80}, - { 0, 9, 127, 25, 36, 53, 84}, - { 0, 13, 127, 25, 39, 47, 88}, - { 28, 29, 46, 28, 39, 2, 123}, - { 24, 24, 62, 27, 41, 1, 125}, - { 19, 14, 81, 25, 43, 0, 126}, - { 13, 4, 101, 24, 44, 0, 127}, - { 6, 0, 116, 23, 45, 0, 127}, - { 0, 0, 126, 23, 45, 1, 127}, - { 0, 4, 127, 25, 44, 2, 127}, - { 0, 9, 127, 25, 44, 3, 127} + { 0, 39, 34, 33, 58, 44, 
31, 32}, + { 0, 60, 38, 32, 40, 51, 30, 31}, + { 0, 73, 49, 31, 39, 48, 32, 31}, + { 0, 60, 73, 30, 39, 46, 33, 32}, + { 0, 43, 87, 35, 38, 45, 33, 32}, + { 0, 35, 78, 54, 36, 45, 33, 32}, + { 0, 33, 47, 86, 35, 44, 33, 32}, + { 0, 31, 17, 114, 34, 44, 34, 33}, + { 0, 43, 37, 32, 53, 70, 30, 31}, + { 0, 53, 50, 30, 42, 72, 31, 30}, + { 0, 52, 66, 30, 39, 70, 32, 30}, + { 0, 46, 78, 35, 37, 68, 34, 30}, + { 0, 43, 75, 48, 37, 66, 34, 30}, + { 0, 40, 62, 68, 35, 65, 35, 30}, + { 0, 33, 37, 97, 33, 62, 37, 31}, + { 0, 26, 14, 122, 32, 59, 38, 33}, + { 0, 40, 39, 33, 34, 87, 37, 30}, + { 0, 45, 54, 32, 34, 84, 41, 29}, + { 0, 41, 70, 35, 33, 83, 40, 29}, + { 0, 37, 73, 44, 32, 82, 40, 30}, + { 0, 37, 65, 60, 31, 81, 41, 29}, + { 0, 35, 48, 82, 30, 79, 43, 29}, + { 0, 28, 27, 108, 28, 76, 45, 30}, + { 0, 19, 11, 127, 27, 70, 46, 32}, + { 0, 38, 40, 34, 27, 73, 62, 28}, + { 0, 39, 54, 35, 30, 73, 62, 28}, + { 0, 33, 65, 41, 29, 75, 59, 28}, + { 0, 30, 65, 53, 27, 76, 58, 29}, + { 0, 29, 53, 72, 26, 77, 58, 29}, + { 0, 27, 35, 95, 24, 77, 60, 28}, + { 0, 19, 19, 117, 23, 74, 61, 30}, + { 0, 9, 16, 127, 23, 68, 60, 34}, + { 0, 35, 40, 35, 29, 44, 89, 30}, + { 0, 33, 51, 39, 29, 49, 86, 30}, + { 0, 28, 57, 49, 28, 53, 83, 30}, + { 0, 24, 52, 65, 26, 56, 82, 30}, + { 0, 22, 39, 86, 24, 58, 82, 30}, + { 0, 18, 22, 108, 23, 59, 82, 31}, + { 0, 10, 13, 125, 22, 58, 80, 33}, + { 0, 0, 19, 127, 22, 56, 74, 40}, + { 0, 33, 40, 36, 31, 28, 90, 45}, + { 0, 29, 46, 44, 29, 31, 92, 43}, + { 0, 24, 45, 58, 28, 34, 91, 43}, + { 0, 19, 37, 78, 26, 37, 91, 43}, + { 0, 15, 22, 99, 25, 38, 91, 42}, + { 0, 11, 11, 118, 24, 39, 90, 44}, + { 0, 2, 11, 127, 23, 41, 85, 48}, + { 0, 0, 17, 127, 23, 43, 75, 55}, + { 0, 31, 37, 39, 30, 28, 54, 82}, + { 0, 27, 37, 52, 28, 30, 58, 79}, + { 0, 22, 30, 70, 27, 32, 58, 79}, + { 0, 15, 19, 91, 26, 33, 58, 79}, + { 0, 10, 8, 111, 25, 34, 58, 79}, + { 0, 5, 2, 125, 25, 35, 57, 80}, + { 0, 0, 9, 127, 25, 36, 53, 84}, + { 0, 0, 13, 127, 25, 39, 47, 88}, + { 0, 28, 29, 46, 28, 39, 2, 123}, + { 0, 24, 24, 62, 27, 41, 1, 125}, + { 0, 19, 14, 81, 25, 43, 0, 126}, + { 0, 13, 4, 101, 24, 44, 0, 127}, + { 0, 6, 0, 116, 23, 45, 0, 127}, + { 0, 0, 0, 126, 23, 45, 1, 127}, + { 0, 0, 4, 127, 25, 44, 2, 127}, + { 0, 0, 9, 127, 25, 44, 3, 127} }, { - { 30, 32, 32, 42, 34, 32, 32}, - { 63, 26, 34, 16, 38, 32, 32}, - { 98, 26, 34, 25, 34, 33, 32}, - { 75, 61, 30, 31, 32, 33, 32}, - { 36, 94, 32, 30, 33, 32, 32}, - { 26, 76, 58, 30, 33, 32, 32}, - { 30, 39, 91, 31, 32, 33, 31}, - { 32, 23, 105, 32, 32, 32, 32}, - { 34, 30, 33, 31, 52, 29, 32}, - { 66, 24, 34, 11, 41, 33, 32}, - { 97, 28, 34, 24, 34, 33, 32}, - { 71, 65, 30, 30, 32, 33, 32}, - { 34, 92, 35, 30, 33, 32, 32}, - { 26, 70, 64, 29, 34, 32, 32}, - { 30, 37, 94, 30, 33, 32, 31}, - { 32, 23, 105, 31, 33, 33, 31}, - { 37, 29, 33, 8, 79, 27, 32}, - { 71, 22, 35, 5, 50, 32, 32}, - { 98, 29, 34, 23, 34, 34, 32}, - { 66, 70, 30, 31, 31, 33, 32}, - { 31, 92, 38, 30, 33, 32, 32}, - { 26, 66, 68, 29, 34, 32, 31}, - { 30, 34, 97, 30, 34, 33, 31}, - { 31, 22, 106, 30, 34, 33, 31}, - { 40, 28, 34, 0, 76, 46, 28}, - { 76, 21, 35, 0, 55, 35, 32}, - { 97, 32, 34, 21, 37, 33, 33}, - { 61, 75, 29, 30, 32, 32, 32}, - { 29, 92, 40, 29, 33, 32, 32}, - { 26, 62, 73, 29, 34, 32, 31}, - { 29, 32, 99, 30, 34, 33, 30}, - { 31, 22, 107, 30, 34, 33, 31}, - { 42, 27, 34, 1, 48, 79, 25}, - { 80, 20, 35, 0, 48, 47, 31}, - { 94, 36, 32, 17, 40, 33, 33}, - { 55, 80, 29, 27, 35, 31, 32}, - { 27, 90, 43, 28, 34, 32, 31}, - { 26, 58, 76, 29, 33, 33, 30}, - { 29, 30, 101, 
29, 34, 34, 30}, - { 31, 21, 108, 29, 35, 34, 30}, - { 44, 26, 34, 6, 30, 80, 40}, - { 81, 21, 35, 0, 41, 52, 35}, - { 90, 41, 31, 14, 41, 35, 33}, - { 51, 82, 29, 24, 37, 32, 32}, - { 27, 87, 47, 27, 35, 32, 31}, - { 26, 54, 79, 29, 34, 33, 30}, - { 29, 29, 102, 28, 34, 33, 30}, - { 31, 21, 108, 28, 35, 33, 31}, - { 47, 26, 34, 7, 34, 44, 75}, - { 80, 24, 34, 0, 41, 41, 50}, - { 84, 45, 31, 12, 40, 36, 36}, - { 49, 81, 31, 22, 37, 33, 32}, - { 28, 81, 51, 26, 35, 33, 31}, - { 28, 51, 81, 28, 34, 33, 30}, - { 29, 30, 101, 28, 35, 33, 31}, - { 31, 22, 107, 28, 35, 33, 32}, - { 48, 27, 34, 10, 40, 16, 97}, - { 75, 27, 34, 3, 42, 26, 66}, - { 77, 47, 33, 12, 40, 32, 43}, - { 49, 75, 36, 21, 37, 33, 35}, - { 32, 72, 55, 25, 36, 33, 32}, - { 30, 49, 81, 27, 35, 33, 31}, - { 30, 32, 98, 28, 35, 32, 32}, - { 31, 24, 104, 28, 35, 32, 33} + { 0, 30, 32, 32, 42, 34, 32, 32}, + { 0, 63, 26, 34, 16, 38, 32, 32}, + { 0, 98, 26, 34, 25, 34, 33, 32}, + { 0, 75, 61, 30, 31, 32, 33, 32}, + { 0, 36, 94, 32, 30, 33, 32, 32}, + { 0, 26, 76, 58, 30, 33, 32, 32}, + { 0, 30, 39, 91, 31, 32, 33, 31}, + { 0, 32, 23, 105, 32, 32, 32, 32}, + { 0, 34, 30, 33, 31, 52, 29, 32}, + { 0, 66, 24, 34, 11, 41, 33, 32}, + { 0, 97, 28, 34, 24, 34, 33, 32}, + { 0, 71, 65, 30, 30, 32, 33, 32}, + { 0, 34, 92, 35, 30, 33, 32, 32}, + { 0, 26, 70, 64, 29, 34, 32, 32}, + { 0, 30, 37, 94, 30, 33, 32, 31}, + { 0, 32, 23, 105, 31, 33, 33, 31}, + { 0, 37, 29, 33, 8, 79, 27, 32}, + { 0, 71, 22, 35, 5, 50, 32, 32}, + { 0, 98, 29, 34, 23, 34, 34, 32}, + { 0, 66, 70, 30, 31, 31, 33, 32}, + { 0, 31, 92, 38, 30, 33, 32, 32}, + { 0, 26, 66, 68, 29, 34, 32, 31}, + { 0, 30, 34, 97, 30, 34, 33, 31}, + { 0, 31, 22, 106, 30, 34, 33, 31}, + { 0, 40, 28, 34, 0, 76, 46, 28}, + { 0, 76, 21, 35, 0, 55, 35, 32}, + { 0, 97, 32, 34, 21, 37, 33, 33}, + { 0, 61, 75, 29, 30, 32, 32, 32}, + { 0, 29, 92, 40, 29, 33, 32, 32}, + { 0, 26, 62, 73, 29, 34, 32, 31}, + { 0, 29, 32, 99, 30, 34, 33, 30}, + { 0, 31, 22, 107, 30, 34, 33, 31}, + { 0, 42, 27, 34, 1, 48, 79, 25}, + { 0, 80, 20, 35, 0, 48, 47, 31}, + { 0, 94, 36, 32, 17, 40, 33, 33}, + { 0, 55, 80, 29, 27, 35, 31, 32}, + { 0, 27, 90, 43, 28, 34, 32, 31}, + { 0, 26, 58, 76, 29, 33, 33, 30}, + { 0, 29, 30, 101, 29, 34, 34, 30}, + { 0, 31, 21, 108, 29, 35, 34, 30}, + { 0, 44, 26, 34, 6, 30, 80, 40}, + { 0, 81, 21, 35, 0, 41, 52, 35}, + { 0, 90, 41, 31, 14, 41, 35, 33}, + { 0, 51, 82, 29, 24, 37, 32, 32}, + { 0, 27, 87, 47, 27, 35, 32, 31}, + { 0, 26, 54, 79, 29, 34, 33, 30}, + { 0, 29, 29, 102, 28, 34, 33, 30}, + { 0, 31, 21, 108, 28, 35, 33, 31}, + { 0, 47, 26, 34, 7, 34, 44, 75}, + { 0, 80, 24, 34, 0, 41, 41, 50}, + { 0, 84, 45, 31, 12, 40, 36, 36}, + { 0, 49, 81, 31, 22, 37, 33, 32}, + { 0, 28, 81, 51, 26, 35, 33, 31}, + { 0, 28, 51, 81, 28, 34, 33, 30}, + { 0, 29, 30, 101, 28, 35, 33, 31}, + { 0, 31, 22, 107, 28, 35, 33, 32}, + { 0, 48, 27, 34, 10, 40, 16, 97}, + { 0, 75, 27, 34, 3, 42, 26, 66}, + { 0, 77, 47, 33, 12, 40, 32, 43}, + { 0, 49, 75, 36, 21, 37, 33, 35}, + { 0, 32, 72, 55, 25, 36, 33, 32}, + { 0, 30, 49, 81, 27, 35, 33, 31}, + { 0, 30, 32, 98, 28, 35, 32, 32}, + { 0, 31, 24, 104, 28, 35, 32, 33} }, { - { 36, 29, 33, 43, 47, 29, 31}, - { 74, 20, 35, 19, 47, 34, 32}, - { 92, 35, 32, 29, 31, 40, 34}, - { 53, 80, 26, 33, 28, 36, 37}, - { 24, 91, 41, 31, 31, 31, 38}, - { 25, 57, 74, 31, 32, 30, 37}, - { 32, 28, 99, 32, 32, 29, 36}, - { 34, 20, 105, 33, 32, 30, 35}, - { 50, 26, 34, 33, 74, 30, 31}, - { 75, 28, 33, 23, 46, 47, 33}, - { 64, 58, 29, 30, 26, 46, 40}, - { 31, 85, 37, 31, 27, 33, 44}, - { 22, 
67, 64, 30, 31, 28, 42}, - { 29, 35, 93, 31, 32, 27, 40}, - { 33, 20, 105, 32, 33, 27, 37}, - { 34, 19, 106, 33, 32, 29, 36}, - { 51, 29, 33, 25, 72, 51, 30}, - { 61, 42, 31, 30, 31, 60, 39}, - { 40, 70, 34, 32, 24, 41, 50}, - { 22, 72, 54, 30, 31, 27, 50}, - { 25, 44, 83, 30, 33, 25, 44}, - { 32, 23, 102, 32, 33, 26, 40}, - { 34, 18, 107, 32, 33, 28, 37}, - { 34, 19, 105, 33, 32, 30, 35}, - { 45, 35, 32, 30, 39, 79, 33}, - { 43, 53, 33, 35, 24, 53, 55}, - { 27, 67, 45, 32, 29, 27, 61}, - { 22, 53, 72, 30, 33, 22, 52}, - { 28, 31, 95, 31, 33, 25, 43}, - { 32, 20, 105, 32, 33, 27, 38}, - { 34, 18, 107, 32, 32, 29, 36}, - { 34, 20, 105, 33, 31, 31, 35}, - { 38, 40, 32, 35, 23, 72, 54}, - { 31, 55, 39, 34, 29, 32, 73}, - { 22, 57, 60, 31, 35, 18, 64}, - { 25, 39, 86, 31, 35, 22, 49}, - { 30, 24, 101, 32, 33, 27, 40}, - { 33, 19, 106, 32, 32, 30, 36}, - { 34, 18, 107, 33, 31, 31, 35}, - { 34, 20, 104, 33, 31, 32, 34}, - { 33, 42, 35, 34, 28, 39, 82}, - { 26, 51, 50, 33, 34, 18, 80}, - { 23, 46, 74, 31, 35, 20, 59}, - { 27, 32, 93, 32, 34, 26, 44}, - { 31, 22, 103, 32, 32, 30, 37}, - { 33, 19, 106, 33, 31, 31, 35}, - { 34, 19, 106, 33, 31, 32, 34}, - { 35, 21, 103, 34, 31, 32, 34}, - { 29, 41, 41, 33, 34, 20, 92}, - { 24, 44, 62, 34, 35, 18, 73}, - { 24, 37, 83, 34, 33, 25, 52}, - { 28, 28, 97, 33, 32, 30, 40}, - { 32, 23, 103, 33, 31, 32, 36}, - { 34, 20, 105, 34, 30, 33, 34}, - { 35, 20, 104, 34, 30, 33, 33}, - { 35, 22, 102, 34, 30, 33, 34}, - { 27, 38, 51, 34, 34, 20, 86}, - { 26, 37, 71, 35, 34, 24, 64}, - { 27, 33, 87, 35, 32, 30, 47}, - { 30, 28, 96, 34, 31, 32, 39}, - { 32, 24, 100, 35, 30, 32, 36}, - { 34, 23, 101, 34, 30, 33, 34}, - { 35, 23, 101, 34, 30, 32, 34}, - { 34, 24, 99, 35, 30, 33, 34} + { 0, 36, 29, 33, 43, 47, 29, 31}, + { 0, 74, 20, 35, 19, 47, 34, 32}, + { 0, 92, 35, 32, 29, 31, 40, 34}, + { 0, 53, 80, 26, 33, 28, 36, 37}, + { 0, 24, 91, 41, 31, 31, 31, 38}, + { 0, 25, 57, 74, 31, 32, 30, 37}, + { 0, 32, 28, 99, 32, 32, 29, 36}, + { 0, 34, 20, 105, 33, 32, 30, 35}, + { 0, 50, 26, 34, 33, 74, 30, 31}, + { 0, 75, 28, 33, 23, 46, 47, 33}, + { 0, 64, 58, 29, 30, 26, 46, 40}, + { 0, 31, 85, 37, 31, 27, 33, 44}, + { 0, 22, 67, 64, 30, 31, 28, 42}, + { 0, 29, 35, 93, 31, 32, 27, 40}, + { 0, 33, 20, 105, 32, 33, 27, 37}, + { 0, 34, 19, 106, 33, 32, 29, 36}, + { 0, 51, 29, 33, 25, 72, 51, 30}, + { 0, 61, 42, 31, 30, 31, 60, 39}, + { 0, 40, 70, 34, 32, 24, 41, 50}, + { 0, 22, 72, 54, 30, 31, 27, 50}, + { 0, 25, 44, 83, 30, 33, 25, 44}, + { 0, 32, 23, 102, 32, 33, 26, 40}, + { 0, 34, 18, 107, 32, 33, 28, 37}, + { 0, 34, 19, 105, 33, 32, 30, 35}, + { 0, 45, 35, 32, 30, 39, 79, 33}, + { 0, 43, 53, 33, 35, 24, 53, 55}, + { 0, 27, 67, 45, 32, 29, 27, 61}, + { 0, 22, 53, 72, 30, 33, 22, 52}, + { 0, 28, 31, 95, 31, 33, 25, 43}, + { 0, 32, 20, 105, 32, 33, 27, 38}, + { 0, 34, 18, 107, 32, 32, 29, 36}, + { 0, 34, 20, 105, 33, 31, 31, 35}, + { 0, 38, 40, 32, 35, 23, 72, 54}, + { 0, 31, 55, 39, 34, 29, 32, 73}, + { 0, 22, 57, 60, 31, 35, 18, 64}, + { 0, 25, 39, 86, 31, 35, 22, 49}, + { 0, 30, 24, 101, 32, 33, 27, 40}, + { 0, 33, 19, 106, 32, 32, 30, 36}, + { 0, 34, 18, 107, 33, 31, 31, 35}, + { 0, 34, 20, 104, 33, 31, 32, 34}, + { 0, 33, 42, 35, 34, 28, 39, 82}, + { 0, 26, 51, 50, 33, 34, 18, 80}, + { 0, 23, 46, 74, 31, 35, 20, 59}, + { 0, 27, 32, 93, 32, 34, 26, 44}, + { 0, 31, 22, 103, 32, 32, 30, 37}, + { 0, 33, 19, 106, 33, 31, 31, 35}, + { 0, 34, 19, 106, 33, 31, 32, 34}, + { 0, 35, 21, 103, 34, 31, 32, 34}, + { 0, 29, 41, 41, 33, 34, 20, 92}, + { 0, 24, 44, 62, 34, 35, 18, 73}, + { 0, 
24, 37, 83, 34, 33, 25, 52}, + { 0, 28, 28, 97, 33, 32, 30, 40}, + { 0, 32, 23, 103, 33, 31, 32, 36}, + { 0, 34, 20, 105, 34, 30, 33, 34}, + { 0, 35, 20, 104, 34, 30, 33, 33}, + { 0, 35, 22, 102, 34, 30, 33, 34}, + { 0, 27, 38, 51, 34, 34, 20, 86}, + { 0, 26, 37, 71, 35, 34, 24, 64}, + { 0, 27, 33, 87, 35, 32, 30, 47}, + { 0, 30, 28, 96, 34, 31, 32, 39}, + { 0, 32, 24, 100, 35, 30, 32, 36}, + { 0, 34, 23, 101, 34, 30, 33, 34}, + { 0, 35, 23, 101, 34, 30, 32, 34}, + { 0, 34, 24, 99, 35, 30, 33, 34} }, { - { 39, 30, 31, 67, 33, 34, 31}, - { 72, 21, 32, 43, 39, 33, 31}, - { 100, 23, 32, 35, 39, 34, 31}, - { 75, 63, 24, 32, 38, 34, 32}, - { 32, 98, 26, 29, 37, 35, 32}, - { 22, 77, 55, 29, 36, 35, 31}, - { 31, 37, 90, 31, 35, 35, 32}, - { 35, 22, 100, 33, 33, 36, 33}, - { 47, 29, 32, 74, 54, 32, 31}, - { 71, 24, 32, 60, 50, 36, 30}, - { 86, 31, 30, 46, 48, 37, 30}, - { 65, 63, 25, 34, 46, 39, 30}, - { 33, 85, 32, 28, 43, 40, 30}, - { 26, 64, 60, 27, 39, 41, 30}, - { 33, 33, 87, 29, 35, 41, 31}, - { 37, 23, 93, 32, 33, 41, 32}, - { 41, 32, 32, 45, 84, 32, 32}, - { 55, 31, 32, 50, 70, 40, 30}, - { 62, 37, 31, 45, 61, 45, 29}, - { 53, 55, 31, 36, 55, 48, 29}, - { 38, 63, 40, 29, 48, 50, 28}, - { 34, 49, 60, 27, 43, 51, 29}, - { 38, 30, 78, 28, 38, 50, 31}, - { 40, 24, 83, 30, 36, 48, 33}, - { 35, 33, 33, 29, 75, 58, 29}, - { 39, 35, 33, 34, 68, 59, 29}, - { 41, 39, 34, 36, 61, 62, 29}, - { 41, 43, 37, 33, 54, 64, 28}, - { 41, 43, 45, 30, 48, 65, 29}, - { 42, 36, 56, 27, 44, 63, 30}, - { 42, 30, 65, 27, 41, 60, 33}, - { 42, 28, 68, 28, 37, 56, 36}, - { 33, 34, 33, 31, 42, 88, 30}, - { 31, 36, 34, 31, 44, 84, 31}, - { 31, 37, 35, 32, 43, 83, 31}, - { 35, 35, 39, 32, 40, 82, 31}, - { 40, 32, 44, 31, 38, 81, 31}, - { 44, 30, 48, 30, 37, 78, 33}, - { 44, 30, 52, 28, 37, 72, 36}, - { 43, 30, 55, 29, 35, 66, 40}, - { 32, 33, 33, 34, 25, 85, 48}, - { 30, 34, 34, 33, 25, 88, 44}, - { 30, 34, 36, 34, 25, 90, 41}, - { 33, 32, 38, 34, 25, 90, 40}, - { 38, 29, 41, 34, 26, 88, 40}, - { 42, 29, 41, 33, 27, 85, 41}, - { 43, 30, 42, 31, 28, 80, 43}, - { 42, 31, 45, 31, 30, 72, 47}, - { 32, 33, 33, 33, 26, 54, 79}, - { 31, 32, 34, 35, 20, 68, 68}, - { 32, 32, 35, 36, 17, 76, 62}, - { 34, 31, 36, 36, 17, 79, 59}, - { 37, 29, 37, 36, 18, 78, 58}, - { 39, 29, 37, 35, 20, 77, 58}, - { 41, 30, 37, 34, 22, 74, 58}, - { 40, 31, 40, 32, 26, 68, 59}, - { 33, 31, 34, 33, 29, 31, 98}, - { 34, 30, 34, 35, 23, 45, 88}, - { 34, 31, 34, 36, 20, 54, 82}, - { 35, 31, 34, 36, 18, 59, 78}, - { 36, 31, 34, 37, 19, 60, 76}, - { 38, 30, 34, 36, 20, 61, 74}, - { 39, 31, 35, 35, 22, 60, 73}, - { 39, 31, 37, 34, 24, 59, 71} + { 0, 39, 30, 31, 67, 33, 34, 31}, + { 0, 72, 21, 32, 43, 39, 33, 31}, + { 0, 100, 23, 32, 35, 39, 34, 31}, + { 0, 75, 63, 24, 32, 38, 34, 32}, + { 0, 32, 98, 26, 29, 37, 35, 32}, + { 0, 22, 77, 55, 29, 36, 35, 31}, + { 0, 31, 37, 90, 31, 35, 35, 32}, + { 0, 35, 22, 100, 33, 33, 36, 33}, + { 0, 47, 29, 32, 74, 54, 32, 31}, + { 0, 71, 24, 32, 60, 50, 36, 30}, + { 0, 86, 31, 30, 46, 48, 37, 30}, + { 0, 65, 63, 25, 34, 46, 39, 30}, + { 0, 33, 85, 32, 28, 43, 40, 30}, + { 0, 26, 64, 60, 27, 39, 41, 30}, + { 0, 33, 33, 87, 29, 35, 41, 31}, + { 0, 37, 23, 93, 32, 33, 41, 32}, + { 0, 41, 32, 32, 45, 84, 32, 32}, + { 0, 55, 31, 32, 50, 70, 40, 30}, + { 0, 62, 37, 31, 45, 61, 45, 29}, + { 0, 53, 55, 31, 36, 55, 48, 29}, + { 0, 38, 63, 40, 29, 48, 50, 28}, + { 0, 34, 49, 60, 27, 43, 51, 29}, + { 0, 38, 30, 78, 28, 38, 50, 31}, + { 0, 40, 24, 83, 30, 36, 48, 33}, + { 0, 35, 33, 33, 29, 75, 58, 29}, + { 0, 39, 35, 33, 34, 68, 59, 
29}, + { 0, 41, 39, 34, 36, 61, 62, 29}, + { 0, 41, 43, 37, 33, 54, 64, 28}, + { 0, 41, 43, 45, 30, 48, 65, 29}, + { 0, 42, 36, 56, 27, 44, 63, 30}, + { 0, 42, 30, 65, 27, 41, 60, 33}, + { 0, 42, 28, 68, 28, 37, 56, 36}, + { 0, 33, 34, 33, 31, 42, 88, 30}, + { 0, 31, 36, 34, 31, 44, 84, 31}, + { 0, 31, 37, 35, 32, 43, 83, 31}, + { 0, 35, 35, 39, 32, 40, 82, 31}, + { 0, 40, 32, 44, 31, 38, 81, 31}, + { 0, 44, 30, 48, 30, 37, 78, 33}, + { 0, 44, 30, 52, 28, 37, 72, 36}, + { 0, 43, 30, 55, 29, 35, 66, 40}, + { 0, 32, 33, 33, 34, 25, 85, 48}, + { 0, 30, 34, 34, 33, 25, 88, 44}, + { 0, 30, 34, 36, 34, 25, 90, 41}, + { 0, 33, 32, 38, 34, 25, 90, 40}, + { 0, 38, 29, 41, 34, 26, 88, 40}, + { 0, 42, 29, 41, 33, 27, 85, 41}, + { 0, 43, 30, 42, 31, 28, 80, 43}, + { 0, 42, 31, 45, 31, 30, 72, 47}, + { 0, 32, 33, 33, 33, 26, 54, 79}, + { 0, 31, 32, 34, 35, 20, 68, 68}, + { 0, 32, 32, 35, 36, 17, 76, 62}, + { 0, 34, 31, 36, 36, 17, 79, 59}, + { 0, 37, 29, 37, 36, 18, 78, 58}, + { 0, 39, 29, 37, 35, 20, 77, 58}, + { 0, 41, 30, 37, 34, 22, 74, 58}, + { 0, 40, 31, 40, 32, 26, 68, 59}, + { 0, 33, 31, 34, 33, 29, 31, 98}, + { 0, 34, 30, 34, 35, 23, 45, 88}, + { 0, 34, 31, 34, 36, 20, 54, 82}, + { 0, 35, 31, 34, 36, 18, 59, 78}, + { 0, 36, 31, 34, 37, 19, 60, 76}, + { 0, 38, 30, 34, 36, 20, 61, 74}, + { 0, 39, 31, 35, 35, 22, 60, 73}, + { 0, 39, 31, 37, 34, 24, 59, 71} }, { - { 30, 33, 32, 55, 32, 32, 32}, - { 47, 30, 31, 29, 36, 32, 32}, - { 81, 28, 32, 28, 34, 32, 32}, - { 85, 46, 29, 32, 32, 33, 32}, - { 54, 82, 26, 32, 32, 33, 32}, - { 30, 90, 38, 31, 32, 33, 32}, - { 30, 56, 73, 31, 33, 32, 32}, - { 37, 21, 102, 32, 32, 32, 32}, - { 33, 32, 31, 68, 39, 31, 31}, - { 38, 32, 31, 43, 34, 33, 31}, - { 63, 30, 31, 29, 34, 32, 32}, - { 82, 37, 30, 29, 33, 32, 32}, - { 71, 63, 27, 31, 32, 33, 32}, - { 44, 86, 30, 30, 33, 33, 32}, - { 33, 72, 55, 30, 32, 32, 31}, - { 37, 37, 86, 31, 32, 33, 31}, - { 34, 33, 32, 60, 61, 29, 32}, - { 36, 33, 31, 56, 38, 32, 31}, - { 51, 30, 31, 38, 33, 33, 32}, - { 75, 31, 31, 30, 33, 33, 32}, - { 80, 47, 29, 30, 32, 33, 31}, - { 60, 73, 27, 30, 33, 33, 31}, - { 41, 78, 41, 30, 33, 32, 31}, - { 38, 53, 68, 30, 32, 33, 31}, - { 33, 33, 32, 43, 77, 35, 30}, - { 35, 33, 31, 55, 54, 29, 32}, - { 43, 32, 31, 46, 39, 31, 32}, - { 64, 30, 31, 35, 34, 33, 32}, - { 79, 37, 30, 31, 32, 33, 31}, - { 73, 57, 28, 30, 32, 33, 31}, - { 54, 73, 33, 30, 32, 33, 31}, - { 43, 64, 52, 30, 32, 33, 31}, - { 33, 33, 32, 34, 68, 58, 28}, - { 34, 33, 31, 45, 70, 33, 31}, - { 38, 33, 31, 48, 52, 29, 32}, - { 54, 31, 31, 40, 39, 31, 32}, - { 73, 32, 31, 34, 34, 33, 31}, - { 77, 45, 29, 31, 32, 32, 32}, - { 65, 63, 30, 31, 31, 33, 31}, - { 51, 66, 42, 30, 32, 33, 31}, - { 33, 32, 32, 34, 44, 81, 31}, - { 34, 33, 31, 38, 66, 52, 28}, - { 36, 33, 30, 44, 62, 34, 31}, - { 47, 31, 31, 43, 48, 30, 32}, - { 64, 31, 31, 38, 38, 32, 32}, - { 75, 38, 30, 33, 34, 32, 32}, - { 71, 53, 30, 31, 32, 33, 32}, - { 59, 61, 37, 30, 32, 33, 32}, - { 33, 32, 31, 35, 31, 71, 54}, - { 34, 33, 31, 37, 49, 70, 33}, - { 36, 33, 31, 41, 60, 48, 30}, - { 43, 32, 31, 43, 54, 35, 31}, - { 56, 31, 31, 40, 44, 32, 32}, - { 68, 35, 30, 36, 37, 32, 32}, - { 70, 45, 30, 33, 34, 33, 32}, - { 63, 55, 35, 31, 33, 33, 32}, - { 33, 32, 31, 33, 34, 36, 87}, - { 34, 32, 31, 36, 38, 62, 52}, - { 36, 33, 31, 39, 50, 57, 36}, - { 41, 33, 31, 41, 53, 43, 33}, - { 50, 33, 31, 41, 48, 36, 32}, - { 59, 35, 31, 37, 41, 34, 32}, - { 65, 42, 31, 35, 36, 33, 32}, - { 62, 49, 35, 33, 34, 34, 33} + { 0, 30, 33, 32, 55, 32, 32, 32}, + { 0, 47, 30, 31, 29, 36, 
32, 32}, + { 0, 81, 28, 32, 28, 34, 32, 32}, + { 0, 85, 46, 29, 32, 32, 33, 32}, + { 0, 54, 82, 26, 32, 32, 33, 32}, + { 0, 30, 90, 38, 31, 32, 33, 32}, + { 0, 30, 56, 73, 31, 33, 32, 32}, + { 0, 37, 21, 102, 32, 32, 32, 32}, + { 0, 33, 32, 31, 68, 39, 31, 31}, + { 0, 38, 32, 31, 43, 34, 33, 31}, + { 0, 63, 30, 31, 29, 34, 32, 32}, + { 0, 82, 37, 30, 29, 33, 32, 32}, + { 0, 71, 63, 27, 31, 32, 33, 32}, + { 0, 44, 86, 30, 30, 33, 33, 32}, + { 0, 33, 72, 55, 30, 32, 32, 31}, + { 0, 37, 37, 86, 31, 32, 33, 31}, + { 0, 34, 33, 32, 60, 61, 29, 32}, + { 0, 36, 33, 31, 56, 38, 32, 31}, + { 0, 51, 30, 31, 38, 33, 33, 32}, + { 0, 75, 31, 31, 30, 33, 33, 32}, + { 0, 80, 47, 29, 30, 32, 33, 31}, + { 0, 60, 73, 27, 30, 33, 33, 31}, + { 0, 41, 78, 41, 30, 33, 32, 31}, + { 0, 38, 53, 68, 30, 32, 33, 31}, + { 0, 33, 33, 32, 43, 77, 35, 30}, + { 0, 35, 33, 31, 55, 54, 29, 32}, + { 0, 43, 32, 31, 46, 39, 31, 32}, + { 0, 64, 30, 31, 35, 34, 33, 32}, + { 0, 79, 37, 30, 31, 32, 33, 31}, + { 0, 73, 57, 28, 30, 32, 33, 31}, + { 0, 54, 73, 33, 30, 32, 33, 31}, + { 0, 43, 64, 52, 30, 32, 33, 31}, + { 0, 33, 33, 32, 34, 68, 58, 28}, + { 0, 34, 33, 31, 45, 70, 33, 31}, + { 0, 38, 33, 31, 48, 52, 29, 32}, + { 0, 54, 31, 31, 40, 39, 31, 32}, + { 0, 73, 32, 31, 34, 34, 33, 31}, + { 0, 77, 45, 29, 31, 32, 32, 32}, + { 0, 65, 63, 30, 31, 31, 33, 31}, + { 0, 51, 66, 42, 30, 32, 33, 31}, + { 0, 33, 32, 32, 34, 44, 81, 31}, + { 0, 34, 33, 31, 38, 66, 52, 28}, + { 0, 36, 33, 30, 44, 62, 34, 31}, + { 0, 47, 31, 31, 43, 48, 30, 32}, + { 0, 64, 31, 31, 38, 38, 32, 32}, + { 0, 75, 38, 30, 33, 34, 32, 32}, + { 0, 71, 53, 30, 31, 32, 33, 32}, + { 0, 59, 61, 37, 30, 32, 33, 32}, + { 0, 33, 32, 31, 35, 31, 71, 54}, + { 0, 34, 33, 31, 37, 49, 70, 33}, + { 0, 36, 33, 31, 41, 60, 48, 30}, + { 0, 43, 32, 31, 43, 54, 35, 31}, + { 0, 56, 31, 31, 40, 44, 32, 32}, + { 0, 68, 35, 30, 36, 37, 32, 32}, + { 0, 70, 45, 30, 33, 34, 33, 32}, + { 0, 63, 55, 35, 31, 33, 33, 32}, + { 0, 33, 32, 31, 33, 34, 36, 87}, + { 0, 34, 32, 31, 36, 38, 62, 52}, + { 0, 36, 33, 31, 39, 50, 57, 36}, + { 0, 41, 33, 31, 41, 53, 43, 33}, + { 0, 50, 33, 31, 41, 48, 36, 32}, + { 0, 59, 35, 31, 37, 41, 34, 32}, + { 0, 65, 42, 31, 35, 36, 33, 32}, + { 0, 62, 49, 35, 33, 34, 34, 33} } }; diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d181c959..6be77b21 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4604,17 +4604,13 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, const uint8_t* weight = matrix; const int input_offset = transpose ? in_offset_tr : in_offset; - const bool red_size = (size_id == 2); int pos_res = 0; // Reduced prediction size is 4 or 8 for (int y = 0; y < red_pred_size; y++) { for (int x = 0; x < red_pred_size; x++) { - if (red_size) { - weight -= 1; - } // Use 16-bit intermediates - int tmp0 = red_size ? 
0 : (input[0] * weight[0]); + int tmp0 = input[0] * weight[0]; int tmp1 = input[1] * weight[1]; int tmp2 = input[2] * weight[2]; int tmp3 = input[3] * weight[3]; @@ -4640,6 +4636,62 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, } +// Size ID 2 +void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, + const int16_t* const input, + const uint8_t* matrix, + const bool transpose, + const int in_offset, + const int in_offset_tr) +{ + const int input_size = 8; + const int pred_size = 8; + const int size_id = 2; + + // Use local buffer for transposed result + uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH + uvg_pixel* const out_ptr = transpose ? out_buf_transposed : output; + + int sum = 0; + for (int i = 0; i < input_size; i++) { + sum += input[i]; + } + const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; + + const uint8_t* weight = matrix; + const int input_offset = transpose ? in_offset_tr : in_offset; + + int pos_res = 0; + + // Reduced prediction size is 4 or 8 + for (int y = 0; y < pred_size; y++) { + for (int x = 0; x < pred_size; x++) { + int tmp0 = input[0] * weight[0]; + int tmp1 = input[1] * weight[1]; + int tmp2 = input[2] * weight[2]; + int tmp3 = input[3] * weight[3]; + + tmp0 += input[4] * weight[4]; + tmp1 += input[5] * weight[5]; + tmp2 += input[6] * weight[6]; + tmp3 += input[7] * weight[7]; + + out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); + pos_res++; + weight += input_size; + } + } + + if (transpose) { + for (int y = 0; y < pred_size; y++) { + for (int x = 0; x < pred_size; x++) { + output[y * pred_size + x] = out_ptr[x * pred_size + y]; + } + } + } +} + + void uvg_mip_pred_upsampling_1D_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, const uint8_t src_step, const uint8_t src_stride, @@ -4811,7 +4863,14 @@ void mip_predict_avx2( const uvg_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry; const int16_t* const reduced_bdry16 = transpose ? red_bdry_trans16 : red_bdry16; - uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); + switch (size_id) { + case 0: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; + case 1: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; + case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix, transpose, input_offset, input_offset_trans); break; + default: + assert(false && "Intra MIP: invalid size id.\n"); + break; + } if (need_upsampling) { const uvg_pixel* ver_src = reduced_pred; uint16_t ver_src_step = width; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index c00a6dfa..324a2b41 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -516,15 +516,11 @@ void uvg_mip_reduced_pred(int* const output, const uint8_t* weight = matrix; const int input_offset = transpose ? 
in_offset_tr : in_offset; - - const bool red_size = (size_id == 2); + int pos_res = 0; for (int y = 0; y < red_pred_size; y++) { for (int x = 0; x < red_pred_size; x++) { - if (red_size) { - weight -= 1; - } - int tmp0 = red_size ? 0 : (input[0] * weight[0]); + int tmp0 = input[0] * weight[0]; int tmp1 = input[1] * weight[1]; int tmp2 = input[2] * weight[2]; int tmp3 = input[3] * weight[3]; From e45dbe4efe565849e2843a23f8a02ce26cc2be45 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 19 Mar 2024 16:51:12 +0200 Subject: [PATCH 110/237] Implement intra avx2 mip size id 2. --- src/mip_data.h | 198 +++++++++++++++++++++++++++++++ src/strategies/avx2/intra-avx2.c | 149 +++++++++++++++++++---- 2 files changed, 325 insertions(+), 22 deletions(-) diff --git a/src/mip_data.h b/src/mip_data.h index a6549c34..6ed0d8aa 100644 --- a/src/mip_data.h +++ b/src/mip_data.h @@ -881,3 +881,201 @@ ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][8] = { 0, 62, 49, 35, 33, 34, 34, 33} } }; + + + +// Weight vectors for MIP size_id 2. +static ALIGNED(32) const uint16_t uvg_mip_sid2_weights[] = { + 0, 42, 0, 71, 0, 77, 0, 64, 37, 33, 39, 34, 46, 35, 60, 35, // mode 0, offset 0 + 27, 44, 24, 36, 33, 30, 33, 31, 33, 35, 35, 36, 34, 36, 32, 36, + 0, 49, 0, 42, 0, 40, 0, 38, 71, 38, 66, 50, 52, 67, 43, 75, + 32, 32, 33, 31, 33, 31, 33, 32, 31, 36, 32, 36, 32, 35, 32, 35, + 0, 56, 0, 70, 0, 65, 0, 59, 40, 33, 49, 34, 57, 36, 60, 39, + 26, 43, 30, 28, 34, 28, 33, 30, 38, 36, 38, 38, 33, 39, 31, 38, + 0, 55, 0, 51, 0, 46, 0, 42, 60, 43, 61, 47, 62, 51, 60, 55, + 33, 30, 33, 30, 34, 30, 33, 31, 31, 38, 32, 37, 32, 37, 32, 37, + 0, 60, 0, 68, 0, 62, 0, 58, 42, 34, 52, 35, 58, 37, 59, 41, + 30, 37, 35, 22, 34, 28, 33, 30, 43, 38, 37, 40, 31, 40, 30, 39, + 0, 56, 0, 53, 0, 49, 0, 45, 59, 44, 60, 45, 65, 45, 64, 47, + 34, 30, 33, 30, 33, 30, 33, 31, 31, 38, 31, 38, 31, 38, 32, 38, + 0, 59, 0, 66, 0, 61, 0, 59, 44, 35, 53, 36, 58, 38, 57, 41, + 31, 34, 35, 25, 34, 29, 33, 30, 43, 41, 31, 43, 30, 40, 31, 39, + 0, 57, 0, 54, 0, 51, 0, 48, 58, 43, 61, 43, 64, 43, 64, 45, + 33, 30, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, + 0, 57, 0, 65, 0, 63, 0, 61, 45, 35, 54, 37, 56, 38, 56, 41, + 30, 35, 33, 33, 34, 30, 34, 30, 40, 44, 24, 44, 29, 39, 32, 39, + 0, 58, 0, 54, 0, 51, 0, 48, 58, 42, 62, 41, 65, 42, 63, 43, + 33, 31, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, + 0, 55, 0, 65, 0, 65, 0, 63, 46, 35, 53, 37, 54, 38, 55, 39, + 30, 36, 32, 36, 33, 31, 33, 30, 38, 47, 26, 40, 30, 38, 32, 38, + 0, 59, 0, 54, 0, 49, 0, 48, 58, 40, 64, 40, 66, 40, 64, 42, + 33, 31, 33, 31, 32, 32, 32, 32, 31, 39, 30, 40, 30, 41, 30, 41, + 0, 54, 0, 64, 0, 65, 0, 63, 46, 35, 52, 36, 53, 37, 55, 38, + 30, 34, 32, 34, 33, 32, 33, 31, 39, 49, 34, 35, 32, 37, 31, 39, + 0, 59, 0, 54, 0, 49, 0, 47, 60, 38, 64, 38, 66, 39, 64, 42, + 33, 31, 33, 32, 33, 32, 32, 33, 31, 40, 30, 40, 29, 41, 29, 42, + 0, 51, 0, 61, 0, 63, 0, 62, 46, 35, 51, 36, 53, 37, 55, 37, + 31, 33, 32, 33, 32, 32, 33, 32, 37, 54, 38, 36, 34, 37, 32, 39, + 0, 58, 0, 53, 0, 49, 0, 46, 59, 37, 63, 38, 64, 40, 62, 42, + 33, 32, 33, 32, 33, 33, 33, 33, 31, 40, 31, 40, 30, 41, 30, 42, + 0, 39, 0, 60, 0, 73, 0, 60, 34, 33, 38, 32, 49, 31, 73, 30, // mode 1, offset 512 + 58, 44, 40, 51, 39, 48, 39, 46, 31, 32, 30, 31, 32, 31, 33, 32, + 0, 43, 0, 35, 0, 33, 0, 31, 87, 35, 78, 54, 47, 86, 17, 114, + 38, 45, 36, 45, 35, 44, 34, 44, 33, 32, 33, 32, 33, 32, 34, 33, + 0, 43, 0, 53, 0, 52, 0, 46, 37, 32, 50, 30, 66, 30, 78, 35, + 53, 70, 42, 72, 39, 70, 37, 68, 30, 31, 
31, 30, 32, 30, 34, 30, + 0, 43, 0, 40, 0, 33, 0, 26, 75, 48, 62, 68, 37, 97, 14, 122, + 37, 66, 35, 65, 33, 62, 32, 59, 34, 30, 35, 30, 37, 31, 38, 33, + 0, 40, 0, 45, 0, 41, 0, 37, 39, 33, 54, 32, 70, 35, 73, 44, + 34, 87, 34, 84, 33, 83, 32, 82, 37, 30, 41, 29, 40, 29, 40, 30, + 0, 37, 0, 35, 0, 28, 0, 19, 65, 60, 48, 82, 27, 108, 11, 127, + 31, 81, 30, 79, 28, 76, 27, 70, 41, 29, 43, 29, 45, 30, 46, 32, + 0, 38, 0, 39, 0, 33, 0, 30, 40, 34, 54, 35, 65, 41, 65, 53, + 27, 73, 30, 73, 29, 75, 27, 76, 62, 28, 62, 28, 59, 28, 58, 29, + 0, 29, 0, 27, 0, 19, 0, 9, 53, 72, 35, 95, 19, 117, 16, 127, + 26, 77, 24, 77, 23, 74, 23, 68, 58, 29, 60, 28, 61, 30, 60, 34, + 0, 35, 0, 33, 0, 28, 0, 24, 40, 35, 51, 39, 57, 49, 52, 65, + 29, 44, 29, 49, 28, 53, 26, 56, 89, 30, 86, 30, 83, 30, 82, 30, + 0, 22, 0, 18, 0, 10, 0, 0, 39, 86, 22, 108, 13, 125, 19, 127, + 24, 58, 23, 59, 22, 58, 22, 56, 82, 30, 82, 31, 80, 33, 74, 40, + 0, 33, 0, 29, 0, 24, 0, 19, 40, 36, 46, 44, 45, 58, 37, 78, + 31, 28, 29, 31, 28, 34, 26, 37, 90, 45, 92, 43, 91, 43, 91, 43, + 0, 15, 0, 11, 0, 2, 0, 0, 22, 99, 11, 118, 11, 127, 17, 127, + 25, 38, 24, 39, 23, 41, 23, 43, 91, 42, 90, 44, 85, 48, 75, 55, + 0, 31, 0, 27, 0, 22, 0, 15, 37, 39, 37, 52, 30, 70, 19, 91, + 30, 28, 28, 30, 27, 32, 26, 33, 54, 82, 58, 79, 58, 79, 58, 79, + 0, 10, 0, 5, 0, 0, 0, 0, 8, 111, 2, 125, 9, 127, 13, 127, + 25, 34, 25, 35, 25, 36, 25, 39, 58, 79, 57, 80, 53, 84, 47, 88, + 0, 28, 0, 24, 0, 19, 0, 13, 29, 46, 24, 62, 14, 81, 4, 101, + 28, 39, 27, 41, 25, 43, 24, 44, 2, 123, 1, 125, 0, 126, 0, 127, + 0, 6, 0, 0, 0, 0, 0, 0, 0, 116, 0, 126, 4, 127, 9, 127, + 23, 45, 23, 45, 25, 44, 25, 44, 0, 127, 1, 127, 2, 127, 3, 127, + 0, 30, 0, 63, 0, 98, 0, 75, 32, 32, 26, 34, 26, 34, 61, 30, // mode 2, offset 1024 + 42, 34, 16, 38, 25, 34, 31, 32, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 36, 0, 26, 0, 30, 0, 32, 94, 32, 76, 58, 39, 91, 23, 105, + 30, 33, 30, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, + 0, 34, 0, 66, 0, 97, 0, 71, 30, 33, 24, 34, 28, 34, 65, 30, + 31, 52, 11, 41, 24, 34, 30, 32, 29, 32, 33, 32, 33, 32, 33, 32, + 0, 34, 0, 26, 0, 30, 0, 32, 92, 35, 70, 64, 37, 94, 23, 105, + 30, 33, 29, 34, 30, 33, 31, 33, 32, 32, 32, 32, 32, 31, 33, 31, + 0, 37, 0, 71, 0, 98, 0, 66, 29, 33, 22, 35, 29, 34, 70, 30, + 8, 79, 5, 50, 23, 34, 31, 31, 27, 32, 32, 32, 34, 32, 33, 32, + 0, 31, 0, 26, 0, 30, 0, 31, 92, 38, 66, 68, 34, 97, 22, 106, + 30, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 31, 33, 31, + 0, 40, 0, 76, 0, 97, 0, 61, 28, 34, 21, 35, 32, 34, 75, 29, + 0, 76, 0, 55, 21, 37, 30, 32, 46, 28, 35, 32, 33, 33, 32, 32, + 0, 29, 0, 26, 0, 29, 0, 31, 92, 40, 62, 73, 32, 99, 22, 107, + 29, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 30, 33, 31, + 0, 42, 0, 80, 0, 94, 0, 55, 27, 34, 20, 35, 36, 32, 80, 29, + 1, 48, 0, 48, 17, 40, 27, 35, 79, 25, 47, 31, 33, 33, 31, 32, + 0, 27, 0, 26, 0, 29, 0, 31, 90, 43, 58, 76, 30, 101, 21, 108, + 28, 34, 29, 33, 29, 34, 29, 35, 32, 31, 33, 30, 34, 30, 34, 30, + 0, 44, 0, 81, 0, 90, 0, 51, 26, 34, 21, 35, 41, 31, 82, 29, + 6, 30, 0, 41, 14, 41, 24, 37, 80, 40, 52, 35, 35, 33, 32, 32, + 0, 27, 0, 26, 0, 29, 0, 31, 87, 47, 54, 79, 29, 102, 21, 108, + 27, 35, 29, 34, 28, 34, 28, 35, 32, 31, 33, 30, 33, 30, 33, 31, + 0, 47, 0, 80, 0, 84, 0, 49, 26, 34, 24, 34, 45, 31, 81, 31, + 7, 34, 0, 41, 12, 40, 22, 37, 44, 75, 41, 50, 36, 36, 33, 32, + 0, 28, 0, 28, 0, 29, 0, 31, 81, 51, 51, 81, 30, 101, 22, 107, + 26, 35, 28, 34, 28, 35, 28, 35, 33, 31, 33, 30, 33, 31, 33, 32, + 0, 48, 0, 75, 0, 77, 0, 49, 27, 34, 27, 34, 47, 
33, 75, 36, + 10, 40, 3, 42, 12, 40, 21, 37, 16, 97, 26, 66, 32, 43, 33, 35, + 0, 32, 0, 30, 0, 30, 0, 31, 72, 55, 49, 81, 32, 98, 24, 104, + 25, 36, 27, 35, 28, 35, 28, 35, 33, 32, 33, 31, 32, 32, 32, 33, + 0, 36, 0, 74, 0, 92, 0, 53, 29, 33, 20, 35, 35, 32, 80, 26, // mode 3, offset 1536 + 43, 47, 19, 47, 29, 31, 33, 28, 29, 31, 34, 32, 40, 34, 36, 37, + 0, 24, 0, 25, 0, 32, 0, 34, 91, 41, 57, 74, 28, 99, 20, 105, + 31, 31, 31, 32, 32, 32, 33, 32, 31, 38, 30, 37, 29, 36, 30, 35, + 0, 50, 0, 75, 0, 64, 0, 31, 26, 34, 28, 33, 58, 29, 85, 37, + 33, 74, 23, 46, 30, 26, 31, 27, 30, 31, 47, 33, 46, 40, 33, 44, + 0, 22, 0, 29, 0, 33, 0, 34, 67, 64, 35, 93, 20, 105, 19, 106, + 30, 31, 31, 32, 32, 33, 33, 32, 28, 42, 27, 40, 27, 37, 29, 36, + 0, 51, 0, 61, 0, 40, 0, 22, 29, 33, 42, 31, 70, 34, 72, 54, + 25, 72, 30, 31, 32, 24, 30, 31, 51, 30, 60, 39, 41, 50, 27, 50, + 0, 25, 0, 32, 0, 34, 0, 34, 44, 83, 23, 102, 18, 107, 19, 105, + 30, 33, 32, 33, 32, 33, 33, 32, 25, 44, 26, 40, 28, 37, 30, 35, + 0, 45, 0, 43, 0, 27, 0, 22, 35, 32, 53, 33, 67, 45, 53, 72, + 30, 39, 35, 24, 32, 29, 30, 33, 79, 33, 53, 55, 27, 61, 22, 52, + 0, 28, 0, 32, 0, 34, 0, 34, 31, 95, 20, 105, 18, 107, 20, 105, + 31, 33, 32, 33, 32, 32, 33, 31, 25, 43, 27, 38, 29, 36, 31, 35, + 0, 38, 0, 31, 0, 22, 0, 25, 40, 32, 55, 39, 57, 60, 39, 86, + 35, 23, 34, 29, 31, 35, 31, 35, 72, 54, 32, 73, 18, 64, 22, 49, + 0, 30, 0, 33, 0, 34, 0, 34, 24, 101, 19, 106, 18, 107, 20, 104, + 32, 33, 32, 32, 33, 31, 33, 31, 27, 40, 30, 36, 31, 35, 32, 34, + 0, 33, 0, 26, 0, 23, 0, 27, 42, 35, 51, 50, 46, 74, 32, 93, + 34, 28, 33, 34, 31, 35, 32, 34, 39, 82, 18, 80, 20, 59, 26, 44, + 0, 31, 0, 33, 0, 34, 0, 35, 22, 103, 19, 106, 19, 106, 21, 103, + 32, 32, 33, 31, 33, 31, 34, 31, 30, 37, 31, 35, 32, 34, 32, 34, + 0, 29, 0, 24, 0, 24, 0, 28, 41, 41, 44, 62, 37, 83, 28, 97, + 33, 34, 34, 35, 34, 33, 33, 32, 20, 92, 18, 73, 25, 52, 30, 40, + 0, 32, 0, 34, 0, 35, 0, 35, 23, 103, 20, 105, 20, 104, 22, 102, + 33, 31, 34, 30, 34, 30, 34, 30, 32, 36, 33, 34, 33, 33, 33, 34, + 0, 27, 0, 26, 0, 27, 0, 30, 38, 51, 37, 71, 33, 87, 28, 96, + 34, 34, 35, 34, 35, 32, 34, 31, 20, 86, 24, 64, 30, 47, 32, 39, + 0, 32, 0, 34, 0, 35, 0, 34, 24, 100, 23, 101, 23, 101, 24, 99, + 35, 30, 34, 30, 34, 30, 35, 30, 32, 36, 33, 34, 32, 34, 33, 34, + 0, 39, 0, 72, 0, 100, 0, 75, 30, 31, 21, 32, 23, 32, 63, 24, // mode 4, offset 2048 + 67, 33, 43, 39, 35, 39, 32, 38, 34, 31, 33, 31, 34, 31, 34, 32, + 0, 32, 0, 22, 0, 31, 0, 35, 98, 26, 77, 55, 37, 90, 22, 100, + 29, 37, 29, 36, 31, 35, 33, 33, 35, 32, 35, 31, 35, 32, 36, 33, + 0, 47, 0, 71, 0, 86, 0, 65, 29, 32, 24, 32, 31, 30, 63, 25, + 74, 54, 60, 50, 46, 48, 34, 46, 32, 31, 36, 30, 37, 30, 39, 30, + 0, 33, 0, 26, 0, 33, 0, 37, 85, 32, 64, 60, 33, 87, 23, 93, + 28, 43, 27, 39, 29, 35, 32, 33, 40, 30, 41, 30, 41, 31, 41, 32, + 0, 41, 0, 55, 0, 62, 0, 53, 32, 32, 31, 32, 37, 31, 55, 31, + 45, 84, 50, 70, 45, 61, 36, 55, 32, 32, 40, 30, 45, 29, 48, 29, + 0, 38, 0, 34, 0, 38, 0, 40, 63, 40, 49, 60, 30, 78, 24, 83, + 29, 48, 27, 43, 28, 38, 30, 36, 50, 28, 51, 29, 50, 31, 48, 33, + 0, 35, 0, 39, 0, 41, 0, 41, 33, 33, 35, 33, 39, 34, 43, 37, + 29, 75, 34, 68, 36, 61, 33, 54, 58, 29, 59, 29, 62, 29, 64, 28, + 0, 41, 0, 42, 0, 42, 0, 42, 43, 45, 36, 56, 30, 65, 28, 68, + 30, 48, 27, 44, 27, 41, 28, 37, 65, 29, 63, 30, 60, 33, 56, 36, + 0, 33, 0, 31, 0, 31, 0, 35, 34, 33, 36, 34, 37, 35, 35, 39, + 31, 42, 31, 44, 32, 43, 32, 40, 88, 30, 84, 31, 83, 31, 82, 31, + 0, 40, 0, 44, 0, 44, 0, 43, 32, 44, 30, 48, 30, 52, 30, 55, + 31, 38, 30, 
37, 28, 37, 29, 35, 81, 31, 78, 33, 72, 36, 66, 40, + 0, 32, 0, 30, 0, 30, 0, 33, 33, 33, 34, 34, 34, 36, 32, 38, + 34, 25, 33, 25, 34, 25, 34, 25, 85, 48, 88, 44, 90, 41, 90, 40, + 0, 38, 0, 42, 0, 43, 0, 42, 29, 41, 29, 41, 30, 42, 31, 45, + 34, 26, 33, 27, 31, 28, 31, 30, 88, 40, 85, 41, 80, 43, 72, 47, + 0, 32, 0, 31, 0, 32, 0, 34, 33, 33, 32, 34, 32, 35, 31, 36, + 33, 26, 35, 20, 36, 17, 36, 17, 54, 79, 68, 68, 76, 62, 79, 59, + 0, 37, 0, 39, 0, 41, 0, 40, 29, 37, 29, 37, 30, 37, 31, 40, + 36, 18, 35, 20, 34, 22, 32, 26, 78, 58, 77, 58, 74, 58, 68, 59, + 0, 33, 0, 34, 0, 34, 0, 35, 31, 34, 30, 34, 31, 34, 31, 34, + 33, 29, 35, 23, 36, 20, 36, 18, 31, 98, 45, 88, 54, 82, 59, 78, + 0, 36, 0, 38, 0, 39, 0, 39, 31, 34, 30, 34, 31, 35, 31, 37, + 37, 19, 36, 20, 35, 22, 34, 24, 60, 76, 61, 74, 60, 73, 59, 71, + 0, 30, 0, 47, 0, 81, 0, 85, 33, 32, 30, 31, 28, 32, 46, 29, // mode 5, offset 2560 + 55, 32, 29, 36, 28, 34, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 0, 54, 0, 30, 0, 30, 0, 37, 82, 26, 90, 38, 56, 73, 21, 102, + 32, 32, 31, 32, 31, 33, 32, 32, 33, 32, 33, 32, 32, 32, 32, 32, + 0, 33, 0, 38, 0, 63, 0, 82, 32, 31, 32, 31, 30, 31, 37, 30, + 68, 39, 43, 34, 29, 34, 29, 33, 31, 31, 33, 31, 32, 32, 32, 32, + 0, 71, 0, 44, 0, 33, 0, 37, 63, 27, 86, 30, 72, 55, 37, 86, + 31, 32, 30, 33, 30, 32, 31, 32, 33, 32, 33, 32, 32, 31, 33, 31, + 0, 34, 0, 36, 0, 51, 0, 75, 33, 32, 33, 31, 30, 31, 31, 31, + 60, 61, 56, 38, 38, 33, 30, 33, 29, 32, 32, 31, 33, 32, 33, 32, + 0, 80, 0, 60, 0, 41, 0, 38, 47, 29, 73, 27, 78, 41, 53, 68, + 30, 32, 30, 33, 30, 33, 30, 32, 33, 31, 33, 31, 32, 31, 33, 31, + 0, 33, 0, 35, 0, 43, 0, 64, 33, 32, 33, 31, 32, 31, 30, 31, + 43, 77, 55, 54, 46, 39, 35, 34, 35, 30, 29, 32, 31, 32, 33, 32, + 0, 79, 0, 73, 0, 54, 0, 43, 37, 30, 57, 28, 73, 33, 64, 52, + 31, 32, 30, 32, 30, 32, 30, 32, 33, 31, 33, 31, 33, 31, 33, 31, + 0, 33, 0, 34, 0, 38, 0, 54, 33, 32, 33, 31, 33, 31, 31, 31, + 34, 68, 45, 70, 48, 52, 40, 39, 58, 28, 33, 31, 29, 32, 31, 32, + 0, 73, 0, 77, 0, 65, 0, 51, 32, 31, 45, 29, 63, 30, 66, 42, + 34, 34, 31, 32, 31, 31, 30, 32, 33, 31, 32, 32, 33, 31, 33, 31, + 0, 33, 0, 34, 0, 36, 0, 47, 32, 32, 33, 31, 33, 30, 31, 31, + 34, 44, 38, 66, 44, 62, 43, 48, 81, 31, 52, 28, 34, 31, 30, 32, + 0, 64, 0, 75, 0, 71, 0, 59, 31, 31, 38, 30, 53, 30, 61, 37, + 38, 38, 33, 34, 31, 32, 30, 32, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 33, 0, 34, 0, 36, 0, 43, 32, 31, 33, 31, 33, 31, 32, 31, + 35, 31, 37, 49, 41, 60, 43, 54, 71, 54, 70, 33, 48, 30, 35, 31, + 0, 56, 0, 68, 0, 70, 0, 63, 31, 31, 35, 30, 45, 30, 55, 35, + 40, 44, 36, 37, 33, 34, 31, 33, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 33, 0, 34, 0, 36, 0, 41, 32, 31, 32, 31, 33, 31, 33, 31, + 33, 34, 36, 38, 39, 50, 41, 53, 36, 87, 62, 52, 57, 36, 43, 33, + 0, 50, 0, 59, 0, 65, 0, 62, 33, 31, 35, 31, 42, 31, 49, 35, + 41, 48, 37, 41, 35, 36, 33, 34, 36, 32, 34, 32, 33, 32, 34, 33, +}; \ No newline at end of file diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6be77b21..a8ea46a9 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4592,7 +4592,7 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, // Use local buffer for transposed result uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* const out_ptr = transpose ? out_buf_transposed : output; + uvg_pixel* out_ptr = transpose ? 
out_buf_transposed : output; int sum = 0; for (int i = 0; i < input_size; i++) { @@ -4639,7 +4639,7 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, // Size ID 2 void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const int16_t* const input, - const uint8_t* matrix, + const uint16_t* matrix, const bool transpose, const int in_offset, const int in_offset_tr) @@ -4650,7 +4650,7 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, // Use local buffer for transposed result uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* const out_ptr = transpose ? out_buf_transposed : output; + uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; int sum = 0; for (int i = 0; i < input_size; i++) { @@ -4658,28 +4658,131 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, } const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; - const uint8_t* weight = matrix; + const __m128i vofs = _mm_set1_epi32(offset); + + const uint16_t* weight = matrix; const int input_offset = transpose ? in_offset_tr : in_offset; - int pos_res = 0; + const __m128i vinofs = _mm_set1_epi32(input_offset); + + const __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03); + const __m128i vshuf1 = _mm_setr_epi8( + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07); + const __m128i vshuf2 = _mm_setr_epi8( + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b); + const __m128i vshuf3 = _mm_setr_epi8( + 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f); + + const __m128i vinraw = _mm_loadu_si128((__m128i*)input); + + const __m128i vin0 = _mm_shuffle_epi8(vinraw, vshuf0); + const __m128i vin1 = _mm_shuffle_epi8(vinraw, vshuf1); + const __m128i vin2 = _mm_shuffle_epi8(vinraw, vshuf2); + const __m128i vin3 = _mm_shuffle_epi8(vinraw, vshuf3); + - // Reduced prediction size is 4 or 8 - for (int y = 0; y < pred_size; y++) { - for (int x = 0; x < pred_size; x++) { - int tmp0 = input[0] * weight[0]; - int tmp1 = input[1] * weight[1]; - int tmp2 = input[2] * weight[2]; - int tmp3 = input[3] * weight[3]; + for (int y = 0; y < pred_size; y += 2) { + // Calculate row 1, first 4 + __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + __m128i vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + __m128i vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + __m128i vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); - tmp0 += input[4] * weight[4]; - tmp1 += input[5] * weight[5]; - tmp2 += input[6] * weight[6]; - tmp3 += input[7] * weight[7]; + __m128i vmadd0 = _mm_madd_epi16(vin0, vweight0); + __m128i vmadd1 = _mm_madd_epi16(vin1, vweight1); + __m128i vmadd2 = _mm_madd_epi16(vin2, vweight2); + __m128i vmadd3 = _mm_madd_epi16(vin3, vweight3); - out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); - pos_res++; - weight += input_size; - } + __m128i vadd0 = _mm_add_epi32(vmadd0, vmadd1); + __m128i vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + __m128i result0 = _mm_add_epi32(vadd0, vadd1); + + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + + // Calculate row 1, last 4 + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) 
&weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + __m128i result1 = _mm_add_epi32(vadd0, vadd1); + + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + + __m128i vres16_a = _mm_packus_epi32(result0, result1); + + weight += input_size * 4; + + // Calculate row 2, first 4 + vweight0 = _mm_loadu_si128((__m128i*) & weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) & weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) & weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) & weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + result0 = _mm_add_epi32(vadd0, vadd1); + + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + + // Calculate row 2, last 4 + vweight0 = _mm_loadu_si128((__m128i*) & weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) & weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) & weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) & weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + result1 = _mm_add_epi32(vadd0, vadd1); + + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + + __m128i vres16_b = _mm_packus_epi32(result0, result1); + __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); + + _mm_storeu_si128((__m128i*)out_ptr, vres8); + + //out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); + out_ptr += 16; + weight += input_size * 4; } if (transpose) { @@ -4842,6 +4945,7 @@ void mip_predict_avx2( const bool transpose = mip_transp; const uint8_t* matrix = 0; + const uint16_t* matrix16 = 0; switch (size_id) { case 0: matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; @@ -4850,7 +4954,8 @@ void mip_predict_avx2( matrix = &uvg_mip_matrix_8x8[mode_idx][0][0]; break; case 2: - matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; + //matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; + matrix16 = &uvg_mip_sid2_weights[mode_idx * 512]; break; default: assert(false && "Invalid MIP size id."); @@ -4866,7 +4971,7 @@ void mip_predict_avx2( switch (size_id) { case 0: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; case 1: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; - case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix, transpose, input_offset, input_offset_trans); break; + case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, 
transpose, input_offset, input_offset_trans); break; default: assert(false && "Intra MIP: invalid size id.\n"); break; From fbe9b8debcdfbe78d4759b84bc34b1c113137a35 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 20 Mar 2024 20:55:15 +0200 Subject: [PATCH 111/237] Implement mip size id 0 and 1. Size id 1 can use the same functions as size id 2, but with different weights. --- src/mip_data.h | 362 +++++++++++++++++++++++++++++++ src/strategies/avx2/intra-avx2.c | 291 +++++++++++++++++++++++-- 2 files changed, 635 insertions(+), 18 deletions(-) diff --git a/src/mip_data.h b/src/mip_data.h index 6ed0d8aa..79a2a3f9 100644 --- a/src/mip_data.h +++ b/src/mip_data.h @@ -882,6 +882,368 @@ ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][8] = } }; +// MIP weight tables for AVX2. + +// This is the same table as used in generic version, but 16-bit. +static ALIGNED(32) const uint16_t uvg_mip_sid0_weights[16][16][4] = +{ + { + { 32, 30, 90, 28}, + { 32, 32, 72, 28}, + { 34, 77, 53, 30}, + { 51, 124, 36, 37}, + { 31, 31, 95, 37}, + { 33, 31, 70, 50}, + { 52, 80, 25, 60}, + { 78, 107, 1, 65}, + { 31, 29, 37, 95}, + { 38, 34, 19, 101}, + { 73, 85, 0, 81}, + { 92, 99, 0, 65}, + { 34, 29, 14, 111}, + { 48, 48, 7, 100}, + { 80, 91, 0, 74}, + { 89, 97, 0, 64} + }, + { + { 31, 23, 34, 29}, + { 31, 43, 34, 31}, + { 30, 95, 34, 32}, + { 29, 100, 35, 33}, + { 31, 23, 34, 29}, + { 31, 43, 34, 31}, + { 30, 95, 34, 32}, + { 29, 99, 35, 33}, + { 31, 24, 35, 29}, + { 31, 44, 34, 31}, + { 30, 95, 35, 32}, + { 29, 99, 35, 33}, + { 31, 24, 35, 30}, + { 31, 44, 35, 31}, + { 30, 95, 35, 32}, + { 29, 99, 35, 33} + }, + { + { 32, 32, 36, 58}, + { 32, 29, 26, 66}, + { 36, 37, 23, 61}, + { 79, 84, 3, 37}, + { 32, 32, 30, 69}, + { 33, 29, 24, 71}, + { 44, 16, 21, 70}, + { 96, 18, 0, 57}, + { 32, 31, 24, 74}, + { 33, 30, 23, 71}, + { 36, 24, 24, 71}, + { 59, 9, 16, 68}, + { 32, 32, 23, 75}, + { 33, 30, 24, 70}, + { 32, 30, 25, 71}, + { 36, 26, 25, 70} + }, + { + { 32, 33, 34, 32}, + { 32, 30, 22, 38}, + { 29, 46, 25, 38}, + { 53, 123, 28, 22}, + { 32, 33, 30, 37}, + { 32, 30, 21, 38}, + { 32, 40, 24, 38}, + { 64, 116, 26, 17}, + { 32, 32, 23, 49}, + { 32, 30, 21, 39}, + { 34, 39, 24, 37}, + { 72, 109, 23, 16}, + { 33, 31, 17, 60}, + { 32, 31, 21, 39}, + { 35, 41, 24, 37}, + { 72, 106, 22, 18} + }, + { + { 34, 25, 89, 20}, + { 38, 32, 47, 24}, + { 40, 86, 29, 27}, + { 38, 98, 32, 29}, + { 34, 31, 94, 40}, + { 44, 25, 83, 27}, + { 54, 72, 43, 16}, + { 47, 94, 33, 22}, + { 33, 31, 36, 94}, + { 43, 23, 51, 76}, + { 62, 55, 64, 25}, + { 57, 89, 38, 15}, + { 32, 32, 28, 101}, + { 38, 26, 33, 94}, + { 55, 38, 68, 47}, + { 59, 80, 52, 16} + }, + { + { 28, 30, 68, 29}, + { 23, 48, 23, 48}, + { 39, 98, 16, 42}, + { 84, 86, 20, 17}, + { 25, 31, 52, 74}, + { 38, 68, 5, 70}, + { 95, 78, 7, 21}, + { 127, 54, 12, 0}, + { 30, 47, 14, 107}, + { 79, 76, 0, 53}, + { 127, 59, 7, 1}, + { 127, 51, 9, 0}, + { 50, 71, 1, 96}, + { 109, 69, 7, 25}, + { 127, 56, 9, 0}, + { 123, 53, 13, 0} + }, + { + { 40, 20, 72, 18}, + { 48, 29, 44, 18}, + { 53, 81, 35, 18}, + { 48, 96, 33, 22}, + { 45, 23, 79, 49}, + { 61, 21, 56, 49}, + { 72, 52, 32, 48}, + { 65, 69, 20, 50}, + { 41, 27, 29, 96}, + { 49, 22, 28, 94}, + { 52, 22, 28, 93}, + { 49, 27, 27, 92}, + { 37, 29, 26, 98}, + { 39, 28, 28, 97}, + { 38, 28, 30, 97}, + { 38, 29, 30, 95} + }, + { + { 33, 27, 43, 27}, + { 32, 29, 31, 31}, + { 31, 73, 33, 31}, + { 35, 104, 34, 28}, + { 32, 30, 63, 22}, + { 33, 26, 33, 29}, + { 33, 57, 33, 30}, + { 37, 100, 35, 27}, + { 32, 31, 85, 25}, + { 34, 25, 39, 
25}, + { 35, 39, 32, 28}, + { 40, 91, 35, 25}, + { 32, 30, 77, 50}, + { 34, 26, 54, 22}, + { 37, 31, 34, 27}, + { 45, 75, 34, 23} + }, + { + { 34, 25, 77, 19}, + { 36, 34, 56, 24}, + { 41, 83, 39, 30}, + { 47, 96, 28, 35}, + { 34, 31, 70, 65}, + { 38, 29, 53, 77}, + { 43, 36, 37, 83}, + { 48, 39, 28, 83}, + { 33, 31, 31, 98}, + { 33, 31, 30, 99}, + { 34, 30, 31, 98}, + { 36, 29, 31, 96}, + { 32, 32, 30, 97}, + { 32, 32, 31, 96}, + { 31, 33, 33, 96}, + { 32, 33, 34, 94} + }, + { + { 30, 30, 93, 19}, + { 31, 59, 67, 34}, + { 31, 79, 36, 59}, + { 30, 67, 17, 79}, + { 30, 38, 68, 69}, + { 29, 40, 43, 91}, + { 26, 35, 32, 101}, + { 23, 32, 30, 101}, + { 26, 34, 30, 101}, + { 23, 33, 30, 102}, + { 20, 32, 31, 102}, + { 18, 33, 32, 102}, + { 23, 33, 31, 100}, + { 20, 34, 32, 100}, + { 18, 35, 33, 100}, + { 18, 35, 33, 100} + }, + { + { 31, 54, 90, 26}, + { 32, 60, 53, 61}, + { 34, 49, 37, 84}, + { 34, 39, 35, 89}, + { 35, 38, 41, 88}, + { 35, 35, 32, 96}, + { 35, 31, 33, 96}, + { 35, 32, 35, 94}, + { 34, 34, 30, 97}, + { 35, 32, 33, 95}, + { 35, 32, 34, 94}, + { 35, 34, 34, 93}, + { 34, 34, 34, 93}, + { 35, 34, 34, 93}, + { 35, 34, 34, 92}, + { 36, 34, 35, 91} + }, + { + { 32, 29, 54, 24}, + { 31, 32, 34, 29}, + { 31, 43, 34, 29}, + { 32, 67, 36, 28}, + { 31, 34, 69, 37}, + { 31, 35, 46, 33}, + { 30, 35, 39, 33}, + { 30, 42, 39, 36}, + { 31, 35, 39, 88}, + { 30, 38, 41, 84}, + { 30, 39, 40, 81}, + { 39, 46, 38, 78}, + { 31, 36, 34, 96}, + { 34, 38, 37, 93}, + { 55, 42, 38, 82}, + { 89, 53, 38, 65} + }, + { + { 32, 33, 43, 29}, + { 32, 30, 29, 33}, + { 31, 47, 31, 33}, + { 33, 100, 31, 31}, + { 32, 33, 74, 25}, + { 32, 32, 34, 31}, + { 32, 33, 30, 33}, + { 32, 68, 30, 32}, + { 32, 31, 91, 40}, + { 32, 32, 58, 26}, + { 31, 31, 30, 32}, + { 31, 42, 30, 33}, + { 32, 31, 49, 85}, + { 32, 31, 83, 35}, + { 31, 33, 48, 29}, + { 31, 36, 32, 33} + }, + { + { 31, 29, 81, 35}, + { 32, 28, 34, 50}, + { 31, 75, 16, 43}, + { 34, 103, 29, 32}, + { 32, 32, 53, 78}, + { 31, 28, 36, 88}, + { 30, 52, 18, 73}, + { 52, 88, 17, 35}, + { 32, 32, 35, 94}, + { 30, 31, 35, 95}, + { 36, 29, 31, 92}, + { 100, 43, 16, 40}, + { 32, 32, 35, 93}, + { 30, 32, 38, 93}, + { 55, 18, 37, 83}, + { 127, 0, 30, 40} + }, + { + { 31, 22, 47, 30}, + { 31, 48, 25, 34}, + { 30, 95, 31, 32}, + { 32, 103, 33, 32}, + { 30, 24, 57, 31}, + { 30, 47, 26, 34}, + { 31, 95, 31, 32}, + { 43, 97, 35, 25}, + { 29, 26, 44, 63}, + { 37, 38, 24, 47}, + { 74, 63, 28, 20}, + { 110, 58, 34, 3}, + { 46, 22, 5, 108}, + { 93, 5, 9, 77}, + { 127, 0, 17, 52}, + { 127, 0, 15, 50} + }, + { + { 32, 27, 68, 24}, + { 35, 23, 35, 28}, + { 35, 64, 29, 29}, + { 37, 104, 33, 28}, + { 32, 32, 91, 40}, + { 36, 23, 67, 36}, + { 49, 23, 39, 28}, + { 60, 67, 30, 20}, + { 32, 32, 36, 95}, + { 35, 29, 38, 93}, + { 50, 16, 30, 84}, + { 72, 16, 15, 65}, + { 32, 32, 27, 100}, + { 33, 32, 29, 100}, + { 37, 29, 30, 98}, + { 48, 21, 29, 90} + } +}; + +// Weight vectors for MIP size_id 1. 
+static ALIGNED(32) const uint16_t uvg_mip_sid1_weights[] = { + 30, 63, 30, 60, 29, 45, 30, 39, 46, 37, 66, 38, 74, 42, 62, 58, // mode 0, offset 0 + 25, 33, 32, 31, 32, 32, 32, 33, 33, 34, 32, 33, 32, 33, 32, 33, + 30, 66, 29, 54, 28, 48, 28, 41, 55, 39, 69, 40, 71, 43, 72, 46, + 32, 30, 33, 31, 32, 33, 32, 34, 30, 36, 31, 33, 32, 33, 32, 33, + 30, 66, 29, 55, 27, 46, 27, 42, 56, 40, 69, 39, 72, 43, 69, 48, + 32, 33, 33, 33, 33, 33, 32, 34, 28, 33, 30, 32, 32, 33, 32, 33, + 30, 63, 29, 56, 27, 47, 27, 42, 55, 40, 66, 40, 69, 44, 65, 50, + 32, 33, 33, 33, 33, 33, 32, 34, 35, 30, 33, 30, 33, 32, 32, 33, + 32, 33, 33, 56, 33, 77, 33, 37, 30, 31, 28, 30, 52, 26, 80, 41, // mode 1, offset 128 + 74, 30, 41, 29, 29, 34, 31, 34, 31, 32, 32, 32, 30, 32, 30, 32, + 32, 32, 33, 31, 33, 47, 33, 61, 33, 31, 31, 30, 28, 29, 44, 28, + 59, 76, 78, 40, 53, 27, 34, 32, 28, 31, 28, 32, 31, 31, 31, 31, + 32, 31, 32, 31, 33, 27, 33, 33, 34, 30, 34, 29, 34, 29, 34, 30, + 26, 64, 45, 86, 73, 55, 62, 33, 76, 27, 36, 29, 25, 32, 30, 31, + 32, 31, 32, 31, 32, 30, 33, 28, 34, 30, 35, 29, 36, 29, 37, 30, + 30, 29, 27, 53, 40, 80, 58, 60, 58, 74, 77, 35, 44, 31, 31, 33, + 32, 51, 32, 95, 32, 27, 32, 34, 27, 32, 42, 29, 99, 34, 21, 104, // mode 2, offset 256 + 27, 50, 29, 42, 31, 41, 31, 42, 29, 32, 30, 32, 29, 32, 30, 32, + 32, 45, 32, 77, 32, 38, 32, 30, 30, 32, 38, 30, 78, 33, 30, 87, + 9, 88, 9, 76, 14, 67, 20, 59, 40, 30, 38, 30, 37, 30, 38, 31, + 33, 37, 34, 44, 36, 39, 37, 31, 32, 32, 34, 31, 45, 31, 31, 54, + 27, 18, 25, 17, 24, 15, 25, 14, 106, 34, 108, 31, 108, 30, 101, 32, + 36, 33, 39, 32, 44, 33, 47, 30, 32, 30, 32, 29, 31, 27, 31, 32, + 29, 37, 27, 37, 25, 37, 25, 34, 13, 110, 15, 108, 16, 106, 19, 102, + 32, 48, 32, 33, 32, 29, 33, 33, 35, 35, 59, 40, 47, 65, 31, 81, // mode 3, offset 384 + 47, 68, 27, 71, 24, 62, 26, 50, 31, 31, 33, 30, 37, 30, 42, 32, + 32, 30, 32, 20, 33, 30, 36, 34, 40, 38, 46, 50, 29, 66, 27, 69, + 30, 70, 26, 55, 25, 41, 26, 31, 55, 31, 64, 31, 72, 33, 67, 39, + 33, 28, 36, 27, 43, 30, 51, 27, 36, 40, 33, 50, 26, 57, 28, 55, + 30, 26, 31, 20, 28, 17, 22, 23, 85, 47, 79, 53, 67, 62, 49, 70, + 38, 29, 51, 31, 69, 23, 77, 13, 32, 39, 28, 43, 30, 40, 35, 38, + 28, 30, 24, 31, 15, 38, 8, 43, 22, 104, 17, 102, 10, 95, 8, 90, + 32, 38, 32, 40, 32, 37, 33, 34, 32, 33, 37, 32, 46, 35, 30, 62, // mode 4, offset 512 +101, 40, 100, 36, 94, 33, 81, 35, 29, 32, 30, 32, 30, 31, 30, 31, + 32, 32, 32, 31, 33, 33, 33, 32, 33, 32, 33, 33, 33, 33, 34, 36, + 22, 102, 26, 104, 31, 103, 37, 94, 39, 29, 34, 28, 32, 28, 33, 28, + 32, 33, 32, 34, 33, 33, 33, 33, 32, 32, 33, 33, 34, 33, 33, 36, + 34, 24, 33, 30, 31, 37, 30, 46, 99, 36, 98, 32, 95, 29, 85, 31, + 32, 33, 32, 34, 32, 33, 33, 33, 32, 33, 33, 33, 34, 34, 32, 37, + 30, 34, 31, 32, 31, 29, 32, 30, 23, 104, 30, 98, 39, 91, 47, 82, + 32, 52, 33, 19, 33, 30, 34, 35, 48, 31, 62, 50, 20, 74, 23, 56, // mode 5, offset 640 + 38, 76, 25, 50, 29, 29, 31, 25, 26, 32, 51, 31, 54, 51, 41, 76, + 33, 25, 35, 28, 37, 35, 38, 32, 38, 39, 25, 47, 22, 38, 33, 29, + 28, 39, 31, 23, 31, 27, 30, 31, 83, 35, 57, 74, 30, 101, 27, 103, + 34, 32, 38, 33, 40, 32, 40, 32, 27, 37, 28, 32, 33, 27, 34, 27, + 32, 25, 30, 31, 29, 33, 28, 33, 41, 92, 18, 111, 18, 111, 23, 105, + 35, 32, 38, 31, 40, 32, 40, 32, 30, 33, 33, 30, 33, 29, 33, 30, + 31, 33, 29, 33, 29, 34, 29, 34, 20, 107, 21, 106, 22, 105, 24, 101, + 32, 28, 33, 30, 33, 60, 33, 63, 31, 33, 28, 33, 26, 33, 44, 36, // mode 6, offset 768 + 92, 33, 71, 26, 47, 28, 37, 31, 30, 31, 32, 30, 33, 30, 33, 30, + 33, 30, 33, 28, 
33, 30, 33, 38, 31, 33, 29, 34, 26, 33, 29, 32, + 43, 90, 71, 71, 86, 45, 74, 32, 33, 29, 26, 30, 28, 30, 33, 29, + 33, 32, 34, 31, 34, 31, 33, 32, 30, 32, 29, 33, 29, 33, 28, 34, + 29, 41, 26, 71, 37, 88, 55, 75, 95, 27, 73, 22, 46, 25, 36, 28, + 34, 31, 35, 32, 34, 33, 34, 34, 30, 32, 28, 33, 28, 33, 28, 34, + 33, 27, 33, 23, 30, 35, 33, 53, 43, 89, 77, 59, 91, 37, 74, 31, + 33, 49, 33, 71, 32, 23, 31, 33, 26, 32, 72, 24, 70, 68, 21, 106, // mode 7, offset 896 + 26, 52, 30, 32, 32, 32, 33, 32, 28, 31, 34, 31, 32, 32, 32, 33, + 34, 47, 34, 44, 32, 27, 30, 33, 32, 29, 89, 28, 46, 89, 20, 107, + 5, 86, 28, 37, 33, 31, 33, 33, 44, 26, 33, 30, 31, 32, 32, 33, + 35, 39, 34, 27, 31, 31, 29, 32, 42, 27, 87, 43, 32, 100, 22, 106, + 26, 24, 30, 34, 32, 33, 33, 33, 92, 35, 38, 31, 30, 32, 32, 33, + 35, 29, 34, 24, 31, 33, 29, 33, 47, 32, 69, 60, 31, 99, 25, 103, + 32, 32, 34, 33, 32, 33, 33, 33, 17, 100, 28, 44, 32, 31, 32, 35, +}; // Weight vectors for MIP size_id 2. diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a8ea46a9..ed9d993f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4636,16 +4636,274 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, } +// Size ID 0 +void uvg_mip_reduced_pred_sid0_avx2(uvg_pixel* const output, + const int16_t* const input, + const uint16_t* matrix, + const bool transpose, + const int in_offset, + const int in_offset_tr) +{ + const int input_size = 4; + const int pred_size = 4; + const int size_id = 0; + + // Use local buffer for transposed result + uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH + uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; + + int sum = 0; + for (int i = 0; i < input_size; i++) { + sum += input[i]; + } + const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; + + const __m128i vofs = _mm_set1_epi32(offset); + + const uint16_t* weight = matrix; + const int input_offset = transpose ? 
in_offset_tr : in_offset; + + const __m128i vinofs = _mm_set1_epi32(input_offset); + + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + ); + + const __m128i vinraw = _mm_loadu_si128((__m128i*)input); + const __m128i vin = _mm_shuffle_epi8(vinraw, vshuf); + + // Calculate first half + __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + __m128i vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + __m128i vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + __m128i vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + weight += 32; + + __m128i vmadd0 = _mm_madd_epi16(vin, vweight0); + __m128i vmadd1 = _mm_madd_epi16(vin, vweight1); + __m128i vmadd2 = _mm_madd_epi16(vin, vweight2); + __m128i vmadd3 = _mm_madd_epi16(vin, vweight3); + + __m128i vresult0 = _mm_hadd_epi32(vmadd0, vmadd1); + __m128i vresult1 = _mm_hadd_epi32(vmadd2, vmadd3); + + vresult0 = _mm_add_epi32(vresult0, vofs); + vresult0 = _mm_srai_epi32(vresult0, MIP_SHIFT_MATRIX); + vresult0 = _mm_add_epi32(vresult0, vinofs); + + vresult1 = _mm_add_epi32(vresult1, vofs); + vresult1 = _mm_srai_epi32(vresult1, MIP_SHIFT_MATRIX); + vresult1 = _mm_add_epi32(vresult1, vinofs); + + __m128i vres16_a = _mm_packus_epi32(vresult0, vresult1); + + // Calculate second half + vweight0 = _mm_loadu_si128((__m128i*) & weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) & weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) & weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) & weight[24]); + + vmadd0 = _mm_madd_epi16(vin, vweight0); + vmadd1 = _mm_madd_epi16(vin, vweight1); + vmadd2 = _mm_madd_epi16(vin, vweight2); + vmadd3 = _mm_madd_epi16(vin, vweight3); + + vresult0 = _mm_hadd_epi32(vmadd0, vmadd1); + vresult1 = _mm_hadd_epi32(vmadd2, vmadd3); + + vresult0 = _mm_add_epi32(vresult0, vofs); + vresult0 = _mm_srai_epi32(vresult0, MIP_SHIFT_MATRIX); + vresult0 = _mm_add_epi32(vresult0, vinofs); + + vresult1 = _mm_add_epi32(vresult1, vofs); + vresult1 = _mm_srai_epi32(vresult1, MIP_SHIFT_MATRIX); + vresult1 = _mm_add_epi32(vresult1, vinofs); + + __m128i vres16_b = _mm_packus_epi32(vresult0, vresult1); + __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); + + _mm_storeu_si128((__m128i*)out_ptr, vres8); + + if (transpose) { + for (int y = 0; y < pred_size; y++) { + for (int x = 0; x < pred_size; x++) { + output[y * pred_size + x] = out_ptr[x * pred_size + y]; + } + } + } +} + +// Size ID 1 +void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, + const int16_t* const input, + const uint16_t* matrix, + const bool transpose, + const int in_offset, + const int in_offset_tr) +{ + const int input_size = 8; + const int pred_size = 4; + const int size_id = 1; + + // Use local buffer for transposed result + uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH + uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; + + int sum = 0; + for (int i = 0; i < input_size; i++) { + sum += input[i]; + } + const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; + + const __m128i vofs = _mm_set1_epi32(offset); + + const uint16_t* weight = matrix; + const int input_offset = transpose ? 
in_offset_tr : in_offset; + + const __m128i vinofs = _mm_set1_epi32(input_offset); + + const __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03); + const __m128i vshuf1 = _mm_setr_epi8( + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07); + const __m128i vshuf2 = _mm_setr_epi8( + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b); + const __m128i vshuf3 = _mm_setr_epi8( + 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f); + + const __m128i vinraw = _mm_loadu_si128((__m128i*)input); + + const __m128i vin0 = _mm_shuffle_epi8(vinraw, vshuf0); + const __m128i vin1 = _mm_shuffle_epi8(vinraw, vshuf1); + const __m128i vin2 = _mm_shuffle_epi8(vinraw, vshuf2); + const __m128i vin3 = _mm_shuffle_epi8(vinraw, vshuf3); + + + for (int y = 0; y < pred_size; y += 2) { + // Calculate row 1, first 4 + __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + __m128i vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + __m128i vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + __m128i vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + __m128i vmadd0 = _mm_madd_epi16(vin0, vweight0); + __m128i vmadd1 = _mm_madd_epi16(vin1, vweight1); + __m128i vmadd2 = _mm_madd_epi16(vin2, vweight2); + __m128i vmadd3 = _mm_madd_epi16(vin3, vweight3); + + __m128i vadd0 = _mm_add_epi32(vmadd0, vmadd1); + __m128i vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + __m128i result0 = _mm_add_epi32(vadd0, vadd1); + + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + + // Calculate row 1, last 4 + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + __m128i result1 = _mm_add_epi32(vadd0, vadd1); + + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + + __m128i vres16_a = _mm_packus_epi32(result0, result1); + + weight += input_size * 4; + + // Calculate row 2, first 4 + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + result0 = _mm_add_epi32(vadd0, vadd1); + + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + + // Calculate row 2, last 4 + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); + + vmadd0 
= _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + result1 = _mm_add_epi32(vadd0, vadd1); + + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + + __m128i vres16_b = _mm_packus_epi32(result0, result1); + __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); + + _mm_storeu_si128((__m128i*)out_ptr, vres8); + + //out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); + out_ptr += 16; + weight += input_size * 4; + } + + if (transpose) { + for (int y = 0; y < pred_size; y++) { + for (int x = 0; x < pred_size; x++) { + output[y * pred_size + x] = out_ptr[x * pred_size + y]; + } + } + } +} + // Size ID 2 void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, + const int red_pred_size, const bool transpose, const int in_offset, const int in_offset_tr) { const int input_size = 8; - const int pred_size = 8; + const int pred_size = red_pred_size; const int size_id = 2; // Use local buffer for transposed result @@ -4734,10 +4992,10 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, weight += input_size * 4; // Calculate row 2, first 4 - vweight0 = _mm_loadu_si128((__m128i*) & weight[0]); - vweight1 = _mm_loadu_si128((__m128i*) & weight[8]); - vweight2 = _mm_loadu_si128((__m128i*) & weight[16]); - vweight3 = _mm_loadu_si128((__m128i*) & weight[24]); + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); vmadd0 = _mm_madd_epi16(vin0, vweight0); vmadd1 = _mm_madd_epi16(vin1, vweight1); @@ -4756,10 +5014,10 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, weight += input_size * 4; // Calculate row 2, last 4 - vweight0 = _mm_loadu_si128((__m128i*) & weight[0]); - vweight1 = _mm_loadu_si128((__m128i*) & weight[8]); - vweight2 = _mm_loadu_si128((__m128i*) & weight[16]); - vweight3 = _mm_loadu_si128((__m128i*) & weight[24]); + vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); + vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); + vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); + vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); vmadd0 = _mm_madd_epi16(vin0, vweight0); vmadd1 = _mm_madd_epi16(vin1, vweight1); @@ -4887,12 +5145,9 @@ void mip_predict_avx2( uint16_t ups_hor_factor = width / red_pred_size; uint16_t ups_ver_factor = height / red_pred_size; - // Upsampling factors must be powers of two - assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two."); - assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two."); - // Initialize prediction parameters END + const uvg_pixel* ref_samples_top = &refs->ref.top[1]; const uvg_pixel* ref_samples_left = &refs->ref.left[1]; @@ -4948,10 +5203,10 @@ void mip_predict_avx2( const uint16_t* matrix16 = 0; switch (size_id) { case 0: - matrix = &uvg_mip_matrix_4x4[mode_idx][0][0]; + matrix16 = &uvg_mip_sid0_weights[mode_idx][0][0]; break; case 1: - matrix = &uvg_mip_matrix_8x8[mode_idx][0][0]; + matrix16 = 
&uvg_mip_sid1_weights[mode_idx * 128]; break; case 2: //matrix = &uvg_mip_matrix_16x16[mode_idx][0][0]; @@ -4969,9 +5224,9 @@ void mip_predict_avx2( const int16_t* const reduced_bdry16 = transpose ? red_bdry_trans16 : red_bdry16; switch (size_id) { - case 0: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; - case 1: uvg_mip_reduced_pred_avx2(reduced_pred, reduced_bdry16, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans); break; - case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 0: uvg_mip_reduced_pred_sid0_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 1: // Size id 1 can use the same function as size id 2 + case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, red_pred_size, transpose, input_offset, input_offset_trans); break; default: assert(false && "Intra MIP: invalid size id.\n"); break; From b0fbbbe8c34698c1913ac5208d04f309d27a0fe5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 21 Mar 2024 17:17:17 +0200 Subject: [PATCH 112/237] Implement 32x32 mip horizontal upsampling. Reduce unnecessary copy work. --- src/strategies/avx2/intra-avx2.c | 190 +++++++++++++++++++++++++++++-- 1 file changed, 181 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index ed9d993f..06d07941 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -40,6 +40,7 @@ #if UVG_BIT_DEPTH == 8 #include +#include #include #include #include @@ -5059,6 +5060,95 @@ void uvg_mip_pred_upsampling_1D_avx2(uvg_pixel* const dst, const uvg_pixel* cons const uint8_t dst_step, const uint8_t dst_stride, const uint8_t boundary_step, const uint8_t ups_factor) +{ + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); + const int rounding_offset = 1 << (log2_factor - 1); + + uint16_t idx_orth_dim = 0; + const uvg_pixel* src_line = src; + uvg_pixel* dst_line = dst; + const uvg_pixel* boundary_line = boundary + boundary_step - 1; + while (idx_orth_dim < src_size_orth_dim) + { + uint16_t idx_upsample_dim = 0; + const uvg_pixel* before = boundary_line; + const uvg_pixel* behind = src_line; + uvg_pixel* cur_dst = dst_line; + while (idx_upsample_dim < src_size_ups_dim) + { + uint16_t pos = 1; + int scaled_before = (*before) << log2_factor; + int scaled_behind = 0; + while (pos <= ups_factor) + { + scaled_before -= *before; + scaled_behind += *behind; + *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; + + pos++; + cur_dst += dst_step; + } + + idx_upsample_dim++; + before = behind; + behind += src_step; + } + + idx_orth_dim++; + src_line += src_stride; + dst_line += dst_stride; + boundary_line += boundary_step; + } +} + +void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, + const uint8_t red_pred_size, const uint16_t dst_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) +{ + const uint8_t ref_step = ups_ver_factor; // height / red_pred_size + const uint8_t ups_factor = ups_hor_factor; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); + const 
int rounding_offset = 1 << (log2_factor - 1); + + uint16_t idx_orth_dim = 0; + const uvg_pixel* src_line = src; + uvg_pixel* dst_line = dst; + const uvg_pixel* ref_line = ref + ref_step - 1; + for (int idx_orth_dim = 0; idx_orth_dim < red_pred_size; ++idx_orth_dim) { + uint16_t idx_upsample_dim = 0; + const uvg_pixel* before = ref_line; + const uvg_pixel* behind = src_line; + uvg_pixel* cur_dst = dst_line; + for (int idx_upsample_dim = 0; idx_upsample_dim < red_pred_size; ++idx_upsample_dim) { + uint16_t pos = 1; + int scaled_before = (*before) << log2_factor; + int scaled_behind = 0; + for (int pos = 0; pos < ups_factor; ++pos) { + scaled_before -= *before; + scaled_behind += *behind; + *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; + + cur_dst++; // Destination step is 1 + } + + before = behind; + behind++; // Source step is 1 + } + + src_line += red_pred_size; // Source stride is same as red_pred_size + dst_line += dst_step; // Destination stride is same as ver_src_step, which is width * ups_ver_factor. Can be as high as 512, must be 16-bit + ref_line += ref_step; + } +} + +void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, + const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, + const uint16_t src_step, const uint8_t src_stride, + const uint8_t dst_step, const uint8_t dst_stride, + const uint8_t boundary_step, + const uint8_t ups_factor) { const int log2_factor = uvg_math_floor_log2(ups_factor); assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); @@ -5102,6 +5192,83 @@ void uvg_mip_pred_upsampling_1D_avx2(uvg_pixel* const dst, const uvg_pixel* cons } +// 32x32, size id 2 hor upscale params [8, 8, 1, 8, 1, 128, 4, 4] +static void mip_upsampling_32x32_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint16_t dst_step = 128; + + const uint8_t ref_step = 4; // height / red_pred_size + const uint8_t ups_factor = 4; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); + const int rounding_offset = 1 << (log2_factor - 1); + + __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f + ); + + + ALIGNED(32) int16_t refs[8]; + ALIGNED(32) int16_t srcs[8]; + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + + int step = ref_step; + + for (int i = 0; i < 8; i++) { + for (int i = 0; i < 8; ++i) { + refs[i] = *ref_ptr; + srcs[i] = *src_ptr; + + ref_ptr += step; + src_ptr += red_pred_size; + } + + __m128i vrnd = _mm_set1_epi16(rounding_offset); + + __m128i vaccu_ref = _mm_load_si128((__m128i*)refs); + __m128i vsub_ref = vaccu_ref; + vaccu_ref = _mm_slli_epi16(vaccu_ref, log2_factor); + + __m128i vaccu_src = _mm_setzero_si128(); + __m128i vadd_src = _mm_load_si128((__m128i*)srcs); + + __m128i vres[4]; + for (int res = 0; res < 4; ++res) { + vaccu_ref = _mm_sub_epi16(vaccu_ref, vsub_ref); + vaccu_src = _mm_add_epi16(vaccu_src, vadd_src); + vres[res] = _mm_add_epi16(vaccu_ref, vaccu_src); + vres[res] = _mm_add_epi16(vres[res], vrnd); + vres[res] = _mm_srli_epi16(vres[res], log2_factor); + } + + __m128i vout0 = _mm_packus_epi16(vres[0], vres[1]); + __m128i vout1 = _mm_packus_epi16(vres[2], vres[3]); + vout0 = _mm_shuffle_epi8(vout0, vshuf0); + vout1 = _mm_shuffle_epi8(vout1, vshuf0); + 
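+    // vout0 now holds (pos 0, pos 1) pairs and vout1 (pos 2, pos 3) pairs for
+    // each of the eight source rows; the unpacks below regroup them so that
+    // every 32-bit store writes four horizontally adjacent output pixels of one row.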
+ __m128i vtmp16lo = _mm_unpacklo_epi16(vout0, vout1); + __m128i vtmp16hi = _mm_unpackhi_epi16(vout0, vout1); + + const int dst_offset = i * 4; + + *(uint32_t*)&dst[dst_offset + dst_step * 0] = _mm_extract_epi32(vtmp16lo, 0); + *(uint32_t*)&dst[dst_offset + dst_step * 1] = _mm_extract_epi32(vtmp16lo, 1); + *(uint32_t*)&dst[dst_offset + dst_step * 2] = _mm_extract_epi32(vtmp16lo, 2); + *(uint32_t*)&dst[dst_offset + dst_step * 3] = _mm_extract_epi32(vtmp16lo, 3); + *(uint32_t*)&dst[dst_offset + dst_step * 4] = _mm_extract_epi32(vtmp16hi, 0); + *(uint32_t*)&dst[dst_offset + dst_step * 5] = _mm_extract_epi32(vtmp16hi, 1); + *(uint32_t*)&dst[dst_offset + dst_step * 6] = _mm_extract_epi32(vtmp16hi, 2); + *(uint32_t*)&dst[dst_offset + dst_step * 7] = _mm_extract_epi32(vtmp16hi, 3); + + ref_ptr = src + i; + src_ptr = src + i + 1; + step = red_pred_size; // Switch ref step + } +} /** \brief Matrix weighted intra prediction. */ @@ -5240,22 +5407,27 @@ void mip_predict_avx2( ver_src = hor_dst; ver_src_step *= ups_ver_factor; - uvg_mip_pred_upsampling_1D_avx2(hor_dst, reduced_pred, ref_samples_left, - red_pred_size, red_pred_size, - 1, red_pred_size, 1, ver_src_step, - ups_ver_factor, ups_hor_factor); + // void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t red_pred_size, const uint8_t ver_src_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) + + // TODO: MIP upscale function pointer table. Test with a single size for now (32x32) + mip_upsampling_32x32_hor_avx2(hor_dst, reduced_pred, ref_samples_left); + } if (ups_ver_factor > 1) { - uvg_mip_pred_upsampling_1D_avx2(result, ver_src, ref_samples_top, - red_pred_size, width, - ver_src_step, 1, width, 1, - 1, ups_ver_factor); + switch (size_id) { + case 0: assert(false && "MIP upscale. Invalid size id.\n"); break; // No upscale is needed for size id 0 + case 1: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 2: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + default: + assert(false && "Intra MIP: invalid size id.\n"); + break; + } } } // Assign and cast values from temp array to output - for (int i = 0; i < 32 * 32; i++) { + for (int i = 0; i < width * height; i++) { out[i] = (uvg_pixel)result[i]; } // *** BLOCK PREDICT *** END From f6b6c1418620512ed1d1c78a53f5c8dfe7047d01 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 26 Mar 2024 21:07:47 +0200 Subject: [PATCH 113/237] Increase buffer size to 64x64. Implement width 32 horizontal 1 to 4 upsampling. 
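The horizontal pass fills every gap between a left boundary sample and the
reduced prediction row with linearly interpolated values. As a reference for
the vectorized versions, one output row boils down to the following scalar
sketch (illustrative only, names are not from the codebase; the committed
AVX2 code produces the same values with packed 16-bit accumulators):

    // One row of 1-to-F horizontal upsampling, F a power of two (2, 4 or 8).
    // 'left' starts as the boundary sample of the row and then trails the
    // previously visited reduced-prediction sample.
    static void mip_ups_hor_row_sketch(uvg_pixel *dst, const uvg_pixel *src,
                                       uvg_pixel left, int red_size, int log2_f)
    {
      const int rnd = 1 << (log2_f - 1);
      for (int x = 0; x < red_size; ++x) {
        const uvg_pixel right = src[x];
        int acc_left  = left << log2_f;  // starts at F * left
        int acc_right = 0;
        for (int pos = 0; pos < (1 << log2_f); ++pos) {
          acc_left  -= left;             // weight of 'left' decreases per step
          acc_right += right;            // weight of 'right' grows per step
          *dst++ = (uvg_pixel)((acc_left + acc_right + rnd) >> log2_f);
        }
        left = right;                    // last sample of each gap equals src[x]
      }
    }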
--- src/strategies/avx2/intra-avx2.c | 45 ++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 06d07941..8a708702 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5192,25 +5192,19 @@ void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* } -// 32x32, size id 2 hor upscale params [8, 8, 1, 8, 1, 128, 4, 4] -static void mip_upsampling_32x32_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; - const uint16_t dst_step = 128; - - const uint8_t ref_step = 4; // height / red_pred_size const uint8_t ups_factor = 4; // width / red_pred_size const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); const int rounding_offset = 1 << (log2_factor - 1); - __m128i vshuf0 = _mm_setr_epi8( + __m128i vshuf = _mm_setr_epi8( 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f ); - ALIGNED(32) int16_t refs[8]; ALIGNED(32) int16_t srcs[8]; const uvg_pixel* ref_ptr = ref + ref_step - 1; @@ -5219,9 +5213,9 @@ static void mip_upsampling_32x32_hor_avx2(uvg_pixel* const dst, const uvg_pixel* int step = ref_step; for (int i = 0; i < 8; i++) { - for (int i = 0; i < 8; ++i) { - refs[i] = *ref_ptr; - srcs[i] = *src_ptr; + for (int ref = 0; ref < 8; ++ref) { + refs[ref] = *ref_ptr; + srcs[ref] = *src_ptr; ref_ptr += step; src_ptr += red_pred_size; @@ -5247,8 +5241,8 @@ static void mip_upsampling_32x32_hor_avx2(uvg_pixel* const dst, const uvg_pixel* __m128i vout0 = _mm_packus_epi16(vres[0], vres[1]); __m128i vout1 = _mm_packus_epi16(vres[2], vres[3]); - vout0 = _mm_shuffle_epi8(vout0, vshuf0); - vout1 = _mm_shuffle_epi8(vout1, vshuf0); + vout0 = _mm_shuffle_epi8(vout0, vshuf); + vout1 = _mm_shuffle_epi8(vout1, vshuf); __m128i vtmp16lo = _mm_unpacklo_epi16(vout0, vout1); __m128i vtmp16hi = _mm_unpackhi_epi16(vout0, vout1); @@ -5284,7 +5278,7 @@ void mip_predict_avx2( // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative uvg_pixel* out = dst; - uvg_pixel result[32 * 32] = { 0 }; + uvg_pixel result[64 * 64] = { 0 }; const int mode_idx = mip_mode; // *** INPUT PREP *** @@ -5293,6 +5287,9 @@ void mip_predict_avx2( uint16_t width = pred_block_width; uint16_t height = pred_block_height; + int log2x_minus2 = uvg_g_convert_to_log2[width] - 2; + int log2y_minus2 = uvg_g_convert_to_log2[height] - 2; + int size_id; // Prediction block type if (width == 4 && height == 4) { size_id = 0; @@ -5409,9 +5406,23 @@ void mip_predict_avx2( // void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t red_pred_size, const uint8_t ver_src_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) - // TODO: MIP upscale function pointer table. 
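With an upsampling factor of 2 the interpolation collapses into a plain
neighbour average, which is why this version needs only byte shuffles and one
add per row pair. A sketch of what each output pair holds (not new code, just
the arithmetic spelled out):

    // dst[2*x]     = (left + src[x] + 1) >> 1;  // midpoint between neighbours
    // dst[2*x + 1] =  src[x];                   // (2*src[x] + 1) >> 1 == src[x]
    // 'left' is the row's boundary sample for x == 0 and src[x - 1] otherwise.

The vshuf0/vshuf2 masks build the 'left' operand (the boundary sample is
inserted into the 0xff byte positions afterwards), vshuf1/vshuf3 duplicate each
source sample, and the sums are rounded and shifted right by one before packing.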
Test with a single size for now (32x32) - mip_upsampling_32x32_hor_avx2(hor_dst, reduced_pred, ref_samples_left); - + switch (width) { + case 4: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 8: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 16: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 32: + if (red_pred_size == 4) { + uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + } + else { + mip_upsampling_w32_ups4_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. + } + break; + case 64: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + default: + assert(false && "Invalid MIP width.\n"); + break; + } } if (ups_ver_factor > 1) { From fb28ec40e2fd1eec8e6d43bc4e661993fbe31d41 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 27 Mar 2024 18:02:24 +0200 Subject: [PATCH 114/237] Implement 8 width horizontal 1 to 2 upsampling. The result for size 8x4 is correct, but fails with stack corruption. --- src/strategies/avx2/intra-avx2.c | 97 +++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8a708702..c9374e82 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5192,6 +5192,85 @@ void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* } +// 8x8, size id 1 hor upscale params [4, 4, 1, 4, 1, 16, 2, 2] +static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 2; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + // Shuffles for result lines 0 and 1 + __m128i vshuf0 = _mm_setr_epi8( + 0xff, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0xff, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07 + ); + + __m128i vshuf1 = _mm_setr_epi8( + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, + 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07 + ); + + // Shuffles for result lines 2 and 3 + __m128i vshuf2 = _mm_setr_epi8( + 0xff, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0xff, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f + ); + + __m128i vshuf3 = _mm_setr_epi8( + 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f + ); + + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + uvg_pixel ref0 = *(ref + (ref_step * 1) - 1); + uvg_pixel ref1 = *(ref + (ref_step * 2) - 1); + uvg_pixel ref2 = *(ref + (ref_step * 3) - 1); + uvg_pixel ref3 = *(ref + (ref_step * 4) - 1); + + __m128i vsrc = _mm_loadu_si128((__m128i*)src); + + __m128i vadd0 = _mm_shuffle_epi8(vsrc, vshuf0); + __m128i vadd1 = _mm_shuffle_epi8(vsrc, vshuf1); + __m128i vadd2 = _mm_shuffle_epi8(vsrc, vshuf2); + __m128i vadd3 = _mm_shuffle_epi8(vsrc, vshuf3); + + vadd0 = _mm_insert_epi8(vadd0, ref0, 0x00); + vadd0 
= _mm_insert_epi8(vadd0, ref1, 0x08); + vadd2 = _mm_insert_epi8(vadd2, ref2, 0x00); + vadd2 = _mm_insert_epi8(vadd2, ref3, 0x08); + + // Extend to 16-bit + __m256i vadd16_0 = _mm256_cvtepu8_epi16(vadd0); + __m256i vadd16_1 = _mm256_cvtepu8_epi16(vadd1); + __m256i vadd16_2 = _mm256_cvtepu8_epi16(vadd2); + __m256i vadd16_3 = _mm256_cvtepu8_epi16(vadd3); + + __m256i vtmp0 = _mm256_add_epi16(vadd16_0, vadd16_1); + __m256i vtmp1 = _mm256_add_epi16(vadd16_2, vadd16_3); + + vtmp0 = _mm256_add_epi16(vtmp0, vrnd); + vtmp1 = _mm256_add_epi16(vtmp1, vrnd); + + vtmp0 = _mm256_srli_epi16(vtmp0, log2_factor); + vtmp1 = _mm256_srli_epi16(vtmp1, log2_factor); + + __m256i vres = _mm256_packus_epi16(vtmp0, vtmp1); + vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); + + if (dst_step == 8) { + _mm256_storeu_si256((__m256i*)dst, vres); + } + else { + *(uint64_t*)&dst[dst_step * 0] = _mm256_extract_epi64(vres, 0); + *(uint64_t*)&dst[dst_step * 1] = _mm256_extract_epi64(vres, 1); + *(uint64_t*)&dst[dst_step * 2] = _mm256_extract_epi64(vres, 2); + *(uint64_t*)&dst[dst_step * 3] = _mm256_extract_epi64(vres, 3); + } +} + static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5205,6 +5284,8 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f ); + __m128i vrnd = _mm_set1_epi16(rounding_offset); + ALIGNED(32) int16_t refs[8]; ALIGNED(32) int16_t srcs[8]; const uvg_pixel* ref_ptr = ref + ref_step - 1; @@ -5221,8 +5302,6 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix src_ptr += red_pred_size; } - __m128i vrnd = _mm_set1_epi16(rounding_offset); - __m128i vaccu_ref = _mm_load_si128((__m128i*)refs); __m128i vsub_ref = vaccu_ref; vaccu_ref = _mm_slli_epi16(vaccu_ref, log2_factor); @@ -5287,9 +5366,6 @@ void mip_predict_avx2( uint16_t width = pred_block_width; uint16_t height = pred_block_height; - int log2x_minus2 = uvg_g_convert_to_log2[width] - 2; - int log2y_minus2 = uvg_g_convert_to_log2[height] - 2; - int size_id; // Prediction block type if (width == 4 && height == 4) { size_id = 0; @@ -5311,7 +5387,6 @@ void mip_predict_avx2( // Initialize prediction parameters END - const uvg_pixel* ref_samples_top = &refs->ref.top[1]; const uvg_pixel* ref_samples_left = &refs->ref.left[1]; @@ -5408,7 +5483,15 @@ void mip_predict_avx2( switch (width) { case 4: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; - case 8: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 8: + if (red_pred_size == 4) { + //uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + } + else { + uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + } + break; case 16: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; case 32: if (red_pred_size == 4) { From 763122295264d92e4add79682d33c3c34b7ae137 Mon Sep 17 00:00:00 
2001 From: siivonek Date: Thu, 28 Mar 2024 15:41:59 +0200 Subject: [PATCH 115/237] Fix stack memory corruption. Memcpy was using sizeof(int) instead of correct sizeof(uvg_pixel). --- src/strategies/avx2/intra-avx2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c9374e82..8fc7f72c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4568,7 +4568,7 @@ void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pix else { // Copy boundary if no downsampling is needed. If this branch is reached, dst_len must be 4 - memcpy(reduced_dst, ref_src, 4 * sizeof(int)); // Copy as much as dst_len indicates + memcpy(reduced_dst, ref_src, 4 * sizeof(uvg_pixel)); // Copy as much as dst_len indicates /*for (uint16_t i = 0; i < dst_len; ++i) { @@ -4944,7 +4944,7 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const __m128i vin2 = _mm_shuffle_epi8(vinraw, vshuf2); const __m128i vin3 = _mm_shuffle_epi8(vinraw, vshuf3); - + // TODO: this does one unnecessary loop for sizes 8x4 and 4x8. Solve this. for (int y = 0; y < pred_size; y += 2) { // Calculate row 1, first 4 __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); @@ -5384,7 +5384,6 @@ void mip_predict_avx2( // Upsampling factors uint16_t ups_hor_factor = width / red_pred_size; uint16_t ups_ver_factor = height / red_pred_size; - // Initialize prediction parameters END const uvg_pixel* ref_samples_top = &refs->ref.top[1]; From 08a3703789b01177d1edffd0ea720fffa02a3210 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 28 Mar 2024 16:34:06 +0200 Subject: [PATCH 116/237] Implement 64 width horizontal 1 to 8 upsampling. This function can handle all possible height cases, since height 4 is not allowed. 
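The factor-8 path keeps the running-accumulator form used by the width 32
function: one register starts at (boundary << 3) and repeatedly subtracts the
boundary sample while another keeps adding the reduced-prediction sample, so no
per-position multiplies are needed. Per 16-bit lane (one lane per output row)
the computation is equivalent to this scalar model (illustrative only):

    int acc_ref = before << 3;                  // log2_factor == 3
    int acc_src = 0;
    for (int pos = 0; pos < 8; ++pos) {
      acc_ref -= before;                        // weights 7, 6, ..., 0
      acc_src += behind;                        // weights 1, 2, ..., 8
      out[pos] = (acc_ref + acc_src + 4) >> 3;  // rounding_offset == 4
    }

The eight per-lane results are packed to bytes, and the shuffle/unpack sequence
regroups them so that every 64-bit store writes eight horizontally adjacent
pixels of one output row.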
--- src/strategies/avx2/intra-avx2.c | 87 +++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8fc7f72c..87f09079 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5343,6 +5343,89 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m128i vshuf = _mm_setr_epi8( + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f + ); + + __m128i vrnd = _mm_set1_epi16(rounding_offset); + + ALIGNED(32) int16_t refs[8]; + ALIGNED(32) int16_t srcs[8]; + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + + int step = ref_step; + + for (int i = 0; i < 8; i++) { + for (int ref = 0; ref < 8; ++ref) { + refs[ref] = *ref_ptr; + srcs[ref] = *src_ptr; + + ref_ptr += step; + src_ptr += red_pred_size; + } + + __m128i vaccu_ref = _mm_load_si128((__m128i*)refs); + __m128i vsub_ref = vaccu_ref; + vaccu_ref = _mm_slli_epi16(vaccu_ref, log2_factor); + + __m128i vaccu_src = _mm_setzero_si128(); + __m128i vadd_src = _mm_load_si128((__m128i*)srcs); + + __m128i vres[8]; + for (int res = 0; res < 8; ++res) { + vaccu_ref = _mm_sub_epi16(vaccu_ref, vsub_ref); + vaccu_src = _mm_add_epi16(vaccu_src, vadd_src); + vres[res] = _mm_add_epi16(vaccu_ref, vaccu_src); + vres[res] = _mm_add_epi16(vres[res], vrnd); + vres[res] = _mm_srli_epi16(vres[res], log2_factor); + } + + __m128i vout0 = _mm_packus_epi16(vres[0], vres[1]); + __m128i vout1 = _mm_packus_epi16(vres[2], vres[3]); + __m128i vout2 = _mm_packus_epi16(vres[4], vres[5]); + __m128i vout3 = _mm_packus_epi16(vres[6], vres[7]); + vout0 = _mm_shuffle_epi8(vout0, vshuf); + vout1 = _mm_shuffle_epi8(vout1, vshuf); + vout2 = _mm_shuffle_epi8(vout2, vshuf); + vout3 = _mm_shuffle_epi8(vout3, vshuf); + + __m128i vtmp16lo0 = _mm_unpacklo_epi16(vout0, vout1); + __m128i vtmp16hi0 = _mm_unpackhi_epi16(vout0, vout1); + __m128i vtmp16lo1 = _mm_unpacklo_epi16(vout2, vout3); + __m128i vtmp16hi1 = _mm_unpackhi_epi16(vout2, vout3); + + __m128i vtmp32lo0 = _mm_unpacklo_epi32(vtmp16lo0, vtmp16lo1); + __m128i vtmp32hi0 = _mm_unpackhi_epi32(vtmp16lo0, vtmp16lo1); + __m128i vtmp32lo1 = _mm_unpacklo_epi32(vtmp16hi0, vtmp16hi1); + __m128i vtmp32hi1 = _mm_unpackhi_epi32(vtmp16hi0, vtmp16hi1); + + const int dst_offset = i * 8; + + *(uint64_t*)&dst[dst_offset + dst_step * 0] = _mm_extract_epi64(vtmp32lo0, 0); + *(uint64_t*)&dst[dst_offset + dst_step * 1] = _mm_extract_epi64(vtmp32lo0, 1); + *(uint64_t*)&dst[dst_offset + dst_step * 2] = _mm_extract_epi64(vtmp32hi0, 0); + *(uint64_t*)&dst[dst_offset + dst_step * 3] = _mm_extract_epi64(vtmp32hi0, 1); + *(uint64_t*)&dst[dst_offset + dst_step * 4] = _mm_extract_epi64(vtmp32lo1, 0); + *(uint64_t*)&dst[dst_offset + dst_step * 5] = _mm_extract_epi64(vtmp32lo1, 1); + *(uint64_t*)&dst[dst_offset + dst_step * 6] = _mm_extract_epi64(vtmp32hi1, 0); + *(uint64_t*)&dst[dst_offset + dst_step * 7] = _mm_extract_epi64(vtmp32hi1, 1); + + ref_ptr = src + i; + src_ptr = src + i + 1; + step = red_pred_size; // Switch ref step + } +} + /** 
\brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -5480,6 +5563,8 @@ void mip_predict_avx2( // void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t red_pred_size, const uint8_t ver_src_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) + //uvg_pixel tmp[64 * 64] = { 0 }; + //uvg_pixel* const tmp_dst = tmp + (ups_ver_factor - 1) * width; switch (width) { case 4: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; case 8: @@ -5500,7 +5585,7 @@ void mip_predict_avx2( mip_upsampling_w32_ups4_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. } break; - case 64: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 64: mip_upsampling_w64_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; default: assert(false && "Invalid MIP width.\n"); break; From 869b587222b8b1fac8da324889a074d37dfb6093 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 2 Apr 2024 16:07:05 +0300 Subject: [PATCH 117/237] Implement alternate version of w32 horizontal 1 to 4 upsampling. This is much faster than the previous version. --- src/strategies/avx2/intra-avx2.c | 116 ++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 87f09079..a04834d3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5343,6 +5343,120 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 4; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m128i vshufsrc = _mm_setr_epi8( + 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d + ); + + __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 + ); + + __m128i vshuf1 = _mm_setr_epi8( + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 + ); + + __m128i vshuf2 = _mm_setr_epi8( + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b + ); + + __m128i vshuf3 = _mm_setr_epi8( + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f + ); + + __m128i vrnd = _mm_set1_epi16(rounding_offset); + __m128i vmul = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4); + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + + int step = ref_step; + + uvg_pixel* dst_ptr = dst; + + for (int i = 0; i < 8; i++) { + // Handle input data + int16_t before = *ref_ptr; + __m128i vtmp = _mm_loadu_si128((__m128i*)src_ptr); + __m128i vbehind = _mm_cvtepu8_epi16(vtmp); + + __m128i vbefore = vbehind; + vbefore = _mm_shuffle_epi8(vbefore, vshufsrc); + vbefore = _mm_insert_epi16(vbefore, before, 0); + __m128i vbeforeshifted = 
_mm_slli_epi16(vbefore, log2_factor); + + __m128i vinterpolate = _mm_sub_epi16(vbehind, vbefore); + + // Calculate first half of 32 results, results 0-7 + __m128i vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf0); + + __m128i vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf0); + + __m128i vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + // Calculate first half of 32 results, results 8-15 + __m128i vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf1); + + __m128i vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf1); + + __m128i vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + __m128i vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)dst_ptr, vres); + + // Calculate second half of 32 results, results 16-23 + __m128i vbefore2 = _mm_shuffle_epi8(vbeforeshifted, vshuf2); + + __m128i vinterpolate2 = _mm_shuffle_epi8(vinterpolate, vshuf2); + + __m128i vmulres2 = _mm_mullo_epi16(vinterpolate2, vmul); + vmulres2 = _mm_add_epi16(vmulres2, vbefore2); + + vmulres2 = _mm_add_epi16(vmulres2, vrnd); + vmulres2 = _mm_srai_epi16(vmulres2, log2_factor); + + // Calculate second half of 32 results, results 24-31 + __m128i vbefore3 = _mm_shuffle_epi8(vbeforeshifted, vshuf3); + + __m128i vinterpolate3 = _mm_shuffle_epi8(vinterpolate, vshuf3); + + __m128i vmulres3 = _mm_mullo_epi16(vinterpolate3, vmul); + vmulres3 = _mm_add_epi16(vmulres3, vbefore3); + + vmulres3 = _mm_add_epi16(vmulres3, vrnd); + vmulres3 = _mm_srai_epi16(vmulres3, log2_factor); + + vres = _mm_packus_epi16(vmulres2, vmulres3); + + _mm_store_si128((__m128i*)(dst_ptr + 16), vres); + + dst_ptr += dst_step; + ref_ptr += ref_step; + src_ptr += red_pred_size; + } +} + static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5582,7 +5696,7 @@ void mip_predict_avx2( uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); } else { - mip_upsampling_w32_ups4_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. + mip_upsampling_w32_ups4_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. } break; case 64: mip_upsampling_w64_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; From b8533e4d2f26e81720ee9465dfa2a5f6e0d25fcc Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 2 Apr 2024 18:19:54 +0300 Subject: [PATCH 118/237] Implement alternate version of w64 horizontal 1 to 8 upsampling. Same idea as alternate w32 also a lot faster than the previous version. 
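For reference, the _alt versions (this patch and the w32 one before it) rearrange the interpolation so only one multiply per output lane is needed: (F - i) * before + i * behind is computed as (before << log2(F)) + i * (behind - before). A minimal scalar sketch of that formulation, assuming 8-bit samples (the name and signature are illustrative only, not part of the patch):

#include <stdint.h>

/* Same weighted average as the earlier upsamplers, refactored so the per-lane
 * work is a single multiply by i plus an add of the pre-shifted 'before' value. */
static inline uint8_t mip_ups_alt_scalar_sketch(uint8_t before, uint8_t behind, int i, int log2_factor)
{
  const int rounding_offset = 1 << (log2_factor - 1);
  const int interpolate = (int)behind - (int)before;  /* may be negative */
  return (uint8_t)((((int)before << log2_factor) + i * interpolate + rounding_offset) >> log2_factor);
}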
--- src/strategies/avx2/intra-avx2.c | 196 +++++++++++++++++++++++++++++-- 1 file changed, 186 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a04834d3..43b5baba 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5260,7 +5260,13 @@ static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vres = _mm256_packus_epi16(vtmp0, vtmp1); vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); - if (dst_step == 8) { + // Dst step is never 8, since this is only called for 8x8 blocks + *(uint64_t*)&dst[dst_step * 0] = _mm256_extract_epi64(vres, 0); + *(uint64_t*)&dst[dst_step * 1] = _mm256_extract_epi64(vres, 1); + *(uint64_t*)&dst[dst_step * 2] = _mm256_extract_epi64(vres, 2); + *(uint64_t*)&dst[dst_step * 3] = _mm256_extract_epi64(vres, 3); + + /*if (dst_step == 8) { _mm256_storeu_si256((__m256i*)dst, vres); } else { @@ -5268,7 +5274,7 @@ static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixe *(uint64_t*)&dst[dst_step * 1] = _mm256_extract_epi64(vres, 1); *(uint64_t*)&dst[dst_step * 2] = _mm256_extract_epi64(vres, 2); *(uint64_t*)&dst[dst_step * 3] = _mm256_extract_epi64(vres, 3); - } + }*/ } static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) @@ -5540,6 +5546,180 @@ static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups8_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m128i vshufsrc = _mm_setr_epi8( + 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d + ); + + __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 + ); + + __m128i vshuf1 = _mm_setr_epi8( + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 + ); + + __m128i vshuf2 = _mm_setr_epi8( + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05 + ); + + __m128i vshuf3 = _mm_setr_epi8( + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 + ); + + __m128i vshuf4 = _mm_setr_epi8( + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09 + ); + + __m128i vshuf5 = _mm_setr_epi8( + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b + ); + + __m128i vshuf6 = _mm_setr_epi8( + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d + ); + + __m128i vshuf7 = _mm_setr_epi8( + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f + ); + + __m128i vrnd = _mm_set1_epi16(rounding_offset); + __m128i vmul = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + + int step = ref_step; + + uvg_pixel* dst_ptr = dst; + + for (int i = 0; i < 8; i++) { + // Handle input data + int16_t before = *ref_ptr; + __m128i 
vtmp = _mm_loadu_si128((__m128i*)src_ptr); + __m128i vbehind = _mm_cvtepu8_epi16(vtmp); + + __m128i vbefore = vbehind; + vbefore = _mm_shuffle_epi8(vbefore, vshufsrc); + vbefore = _mm_insert_epi16(vbefore, before, 0); + __m128i vbeforeshifted = _mm_slli_epi16(vbefore, log2_factor); + + __m128i vinterpolate = _mm_sub_epi16(vbehind, vbefore); + + // Calculate 1st 16 result chunk + __m128i vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf0); + __m128i vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf0); + + __m128i vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + __m128i vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf1); + __m128i vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf1); + + __m128i vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + __m128i vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 0), vres); + + // Calculate 2nd 16 result chunk + vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf2); + vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf2); + + vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf3); + vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf3); + + vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 16), vres); + + // Calculate 3rd 16 result chunk + vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf4); + vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf4); + + vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf5); + vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf5); + + vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 32), vres); + + // Calculate 4th 16 result chunk + vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf6); + vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf6); + + vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf7); + vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf7); + + vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 48), vres); + + dst_ptr += dst_step; + ref_ptr += ref_step; + src_ptr += red_pred_size; + } +} + /** \brief 
Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -5682,13 +5862,9 @@ void mip_predict_avx2( switch (width) { case 4: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; case 8: - if (red_pred_size == 4) { - //uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); - mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); - } - else { - uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); - } + // This will only get called for 8x8 blocks. + mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + break; case 16: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; case 32: @@ -5699,7 +5875,7 @@ void mip_predict_avx2( mip_upsampling_w32_ups4_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. } break; - case 64: mip_upsampling_w64_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; + case 64: mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; default: assert(false && "Invalid MIP width.\n"); break; From 8688316c5d8d0ac5fa4bce53a61981ed8ddee6f1 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 11:20:42 +0300 Subject: [PATCH 119/237] Clean up swtich case. --- src/strategies/avx2/intra-avx2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 43b5baba..570142cf 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5860,13 +5860,14 @@ void mip_predict_avx2( //uvg_pixel tmp[64 * 64] = { 0 }; //uvg_pixel* const tmp_dst = tmp + (ups_ver_factor - 1) * width; switch (width) { - case 4: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + // Case 4 does not exist. There is no need for horizontal upsampling when width is 4. case 8: // This will only get called for 8x8 blocks. mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); - break; - case 16: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); break; + case 16: + uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + break; case 32: if (red_pred_size == 4) { uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); @@ -5875,7 +5876,9 @@ void mip_predict_avx2( mip_upsampling_w32_ups4_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. 
} break; - case 64: mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; + case 64: + mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + break; default: assert(false && "Invalid MIP width.\n"); break; From a96343767221350d64172832f24bccd74936503b Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 14:05:27 +0300 Subject: [PATCH 120/237] Alternate version of w8 horizontal 1 to 2 upsample. --- src/strategies/avx2/intra-avx2.c | 45 +++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 570142cf..ba5f7b55 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5277,6 +5277,49 @@ static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixe }*/ } +static void mip_upsampling_w8_ups2_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 2; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + ALIGNED(16) uint8_t before[17]; + memcpy(&before[1], src_ptr, 16); + before[0] = ref_ptr[ref_step * 0]; + before[4] = ref_ptr[ref_step * 1]; + before[8] = ref_ptr[ref_step * 2]; + before[12] = ref_ptr[ref_step * 3]; + + __m128i vbefore = _mm_load_si128((__m128i*)before); + __m128i vbehind = _mm_load_si128((__m128i*)src_ptr); + + __m128i vavg = _mm_avg_epu8(vbefore, vbehind); + + __m128i vreslo = _mm_unpacklo_epi8(vavg, vbehind); + __m128i vreshi = _mm_unpackhi_epi8(vavg, vbehind); + + // Dst step is never 8, since this is only called for 8x8 blocks + *(uint64_t*)&dst[dst_step * 0] = _mm_extract_epi64(vreslo, 0); + *(uint64_t*)&dst[dst_step * 1] = _mm_extract_epi64(vreslo, 1); + *(uint64_t*)&dst[dst_step * 2] = _mm_extract_epi64(vreshi, 0); + *(uint64_t*)&dst[dst_step * 3] = _mm_extract_epi64(vreshi, 1); + + /*if (dst_step == 8) { + _mm256_storeu_si256((__m256i*)dst, vres); + } + else { + *(uint64_t*)&dst[dst_step * 0] = _mm256_extract_epi64(vres, 0); + *(uint64_t*)&dst[dst_step * 1] = _mm256_extract_epi64(vres, 1); + *(uint64_t*)&dst[dst_step * 2] = _mm256_extract_epi64(vres, 2); + *(uint64_t*)&dst[dst_step * 3] = _mm256_extract_epi64(vres, 3); + }*/ +} + static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5863,7 +5906,7 @@ void mip_predict_avx2( // Case 4 does not exist. There is no need for horizontal upsampling when width is 4. case 8: // This will only get called for 8x8 blocks. - mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + mip_upsampling_w8_ups2_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; case 16: uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); From 866b7d358afe8b3097ab6291b8a98bf27347586d Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 15:33:08 +0300 Subject: [PATCH 121/237] Implement horizontal w16 1 to 2 and w16 1 to 4 upsampling. 
These use Joose's technique using avg_epu8. Implement w32 horizontal 1 to 8 upsampling. --- src/strategies/avx2/intra-avx2.c | 232 ++++++++++++++++++++++++++++++- 1 file changed, 227 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index ba5f7b55..cc824bb0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5320,6 +5320,117 @@ static void mip_upsampling_w8_ups2_hor_avx2_alt(uvg_pixel* const dst, const uvg_ }*/ } +static void mip_upsampling_w16_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 2; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + for (int i = 0; i < 2; ++i) { + ALIGNED(32) uint8_t before[33]; + memcpy(&before[1], src_ptr, 32); + before[0] = ref_ptr[ref_step * 0]; + before[8] = ref_ptr[ref_step * 1]; + before[16] = ref_ptr[ref_step * 2]; + before[24] = ref_ptr[ref_step * 3]; + + __m256i vbefore = _mm256_load_si256((__m256i*)before); + __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + + __m256i vavg = _mm256_avg_epu8(vbefore, vbehind); + + __m256i vreslo = _mm256_unpacklo_epi8(vavg, vbehind); + __m256i vreshi = _mm256_unpackhi_epi8(vavg, vbehind); + + _mm_store_si128((__m128i*) & dst_ptr[dst_step * 0], _mm256_extracti128_si256(vreslo, 0)); + _mm_store_si128((__m128i*) & dst_ptr[dst_step * 1], _mm256_extracti128_si256(vreshi, 0)); + _mm_store_si128((__m128i*) & dst_ptr[dst_step * 2], _mm256_extracti128_si256(vreslo, 1)); + _mm_store_si128((__m128i*) & dst_ptr[dst_step * 3], _mm256_extracti128_si256(vreshi, 1)); + + src_ptr += 32; + dst_ptr += dst_step * 4; + ref_ptr += ref_step * 4; + } +} + +static void mip_upsampling_w16_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 4; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + int step = ref_step; + __m128i ones = _mm_set1_epi8(1); + __m128i threes = _mm_set1_epi8(3); + + __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + // Assign references by hand after copying sources. This will avoid the use of inserts later. + // Before buffer length is 33 since we need to copy reference value into the first index. + // Copying 32 samples is faster than copying 31. First indices of each 8 wide row will be replaced + // with a reference value. + ALIGNED(16) uint8_t before[17]; + memcpy(&before[1], src_ptr, 16); + before[0] = ref_ptr[ref_step * 0]; + before[4] = ref_ptr[ref_step * 1]; + before[8] = ref_ptr[ref_step * 2]; + before[12] = ref_ptr[ref_step * 3]; + + __m128i vbefore = _mm_load_si128((__m128i*)before); + __m128i vbehind = _mm_load_si128((__m128i*)src_ptr); + + // Permute the input values to get the result in correct order. + //vbefore = _mm256_permutevar8x32_epi32(vbefore, permute_mask); + //vbehind = _mm256_permutevar8x32_epi32(vbehind, permute_mask); + + // Calculate the 3 interpolated values between before and behind, middle, left and right. 
+ __m128i vmiddle = _mm_avg_epu8(vbefore, vbehind); + __m128i vleft = _mm_avg_epu8(vmiddle, vbefore); + __m128i vright = _mm_avg_epu8(vmiddle, vbehind); + + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the left interpolated value if the two last bits of the difference between before and behind is 0b01. + __m128i diff = _mm_sub_epi8(vbehind, vbefore); + diff = _mm_and_si128(diff, threes); + __m128i mask = _mm_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. + __m128i sub_amount = _mm_blendv_epi8(_mm_set1_epi8(0), ones, mask); + + vleft = _mm_sub_epi8(vleft, sub_amount); + + // Same rounding error handling for right interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. + mask = _mm_cmpeq_epi8(diff, threes); + sub_amount = _mm_blendv_epi8(_mm_set1_epi8(0), ones, mask); + + vright = _mm_sub_epi8(vright, sub_amount); + + // Interleave results. + __m128i left_temp0 = _mm_unpacklo_epi8(vleft, vmiddle); + __m128i left_temp1 = _mm_unpackhi_epi8(vleft, vmiddle); + __m128i right_temp0 = _mm_unpacklo_epi8(vright, vbehind); + __m128i right_temp1 = _mm_unpackhi_epi8(vright, vbehind); + + __m128i vtmp0 = _mm_unpacklo_epi16(left_temp0, right_temp0); + __m128i vtmp1 = _mm_unpackhi_epi16(left_temp0, right_temp0); + __m128i vtmp2 = _mm_unpacklo_epi16(left_temp1, right_temp1); + __m128i vtmp3 = _mm_unpackhi_epi16(left_temp1, right_temp1); + + _mm_store_si128((__m128i*)(dst_ptr + dst_step * 0), vtmp0); + _mm_store_si128((__m128i*)(dst_ptr + dst_step * 1), vtmp1); + _mm_store_si128((__m128i*)(dst_ptr + dst_step * 2), vtmp2); + _mm_store_si128((__m128i*)(dst_ptr + dst_step * 3), vtmp3); +} + static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5506,6 +5617,114 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg } } +static void mip_upsampling_w32_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 8; // width / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m128i vshufsrc = _mm_setr_epi8( + 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d + ); + + __m128i vshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 + ); + + __m128i vshuf1 = _mm_setr_epi8( + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 + ); + + __m128i vshuf2 = _mm_setr_epi8( + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05 + ); + + __m128i vshuf3 = _mm_setr_epi8( + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 + ); + + __m128i vrnd = _mm_set1_epi16(rounding_offset); + __m128i vmul = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + const uvg_pixel* ref_ptr = ref + ref_step - 1; + const uvg_pixel* src_ptr = src; + + int step = ref_step; + + uvg_pixel* dst_ptr = dst; + + for (int i = 0; i < 8; i++) { + // Handle input data + int16_t 
before = *ref_ptr; + __m128i vtmp = _mm_loadu_si128((__m128i*)src_ptr); + __m128i vbehind = _mm_cvtepu8_epi16(vtmp); + + __m128i vbefore = vbehind; + vbefore = _mm_shuffle_epi8(vbefore, vshufsrc); + vbefore = _mm_insert_epi16(vbefore, before, 0); + __m128i vbeforeshifted = _mm_slli_epi16(vbefore, log2_factor); + + __m128i vinterpolate = _mm_sub_epi16(vbehind, vbefore); + + // Calculate 1st 16 result chunk + __m128i vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf0); + __m128i vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf0); + + __m128i vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + __m128i vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf1); + __m128i vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf1); + + __m128i vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + __m128i vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 0), vres); + + // Calculate 2nd 16 result chunk + vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf2); + vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf2); + + vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); + vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + + vmulres0 = _mm_add_epi16(vmulres0, vrnd); + vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + + vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf3); + vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf3); + + vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); + vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + + vmulres1 = _mm_add_epi16(vmulres1, vrnd); + vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + + vres = _mm_packus_epi16(vmulres0, vmulres1); + + _mm_store_si128((__m128i*)(dst_ptr + 16), vres); + + dst_ptr += dst_step; + ref_ptr += ref_step; + src_ptr += red_pred_size; + } +} + static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5900,8 +6119,6 @@ void mip_predict_avx2( // void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t red_pred_size, const uint8_t ver_src_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) - //uvg_pixel tmp[64 * 64] = { 0 }; - //uvg_pixel* const tmp_dst = tmp + (ups_ver_factor - 1) * width; switch (width) { // Case 4 does not exist. There is no need for horizontal upsampling when width is 4. case 8: @@ -5909,18 +6126,23 @@ void mip_predict_avx2( mip_upsampling_w8_ups2_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; case 16: - uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + if (red_pred_size == 4) { + mip_upsampling_w16_ups4_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + } + else { + mip_upsampling_w16_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 2. 
+ } break; case 32: if (red_pred_size == 4) { - uvg_mip_pred_upsampling_1D_hor_avx2(hor_dst, reduced_pred, ref_samples_left, red_pred_size, ver_src_step, ups_ver_factor, ups_hor_factor); + mip_upsampling_w32_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); } else { mip_upsampling_w32_ups4_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. } break; case 64: - mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 8. break; default: assert(false && "Invalid MIP width.\n"); From 053526650219caa9a1c9bce9689b4d92e83198e2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 3 Apr 2024 08:55:11 +0300 Subject: [PATCH 122/237] Alternate version of mip_upsampling_w32_ups4_hor_avx2 using _mm256_avg_epu8 --- src/strategies/avx2/intra-avx2.c | 169 +++++++++++++++---------------- 1 file changed, 80 insertions(+), 89 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index cc824bb0..1104008f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5503,6 +5503,7 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix } } + static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5511,112 +5512,102 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg const int log2_factor = uvg_g_convert_to_log2[ups_factor]; const int rounding_offset = 1 << (log2_factor - 1); - __m128i vshufsrc = _mm_setr_epi8( - 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, - 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d - ); - - __m128i vshuf0 = _mm_setr_epi8( - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 - ); - - __m128i vshuf1 = _mm_setr_epi8( - 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, - 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 - ); - - __m128i vshuf2 = _mm_setr_epi8( - 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, - 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b - ); - - __m128i vshuf3 = _mm_setr_epi8( - 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, - 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f + __m128i vshuf = _mm_setr_epi8( + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f ); __m128i vrnd = _mm_set1_epi16(rounding_offset); - __m128i vmul = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4); + ALIGNED(32) int16_t refs[8]; + ALIGNED(32) int16_t srcs[8]; const uvg_pixel* ref_ptr = ref + ref_step - 1; const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; int step = ref_step; + __m256i ones = _mm256_set1_epi8(1); + __m256i threes = _mm256_set1_epi8(3); - uvg_pixel* dst_ptr = dst; - for (int i = 0; i < 8; i++) { - // Handle input data - int16_t before = *ref_ptr; - __m128i vtmp = _mm_loadu_si128((__m128i*)src_ptr); - __m128i vbehind = _mm_cvtepu8_epi16(vtmp); - - __m128i vbefore = vbehind; - vbefore = _mm_shuffle_epi8(vbefore, vshufsrc); - vbefore = _mm_insert_epi16(vbefore, before, 0); - __m128i vbeforeshifted = _mm_slli_epi16(vbefore, log2_factor); - - __m128i vinterpolate = 
_mm_sub_epi16(vbehind, vbefore); + for (int i = 0; i < 32 * 32; i += 512) { - // Calculate first half of 32 results, results 0-7 - __m128i vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf0); - - __m128i vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf0); - - __m128i vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); - vmulres0 = _mm_add_epi16(vmulres0, vbefore0); - - vmulres0 = _mm_add_epi16(vmulres0, vrnd); - vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); - - // Calculate first half of 32 results, results 8-15 - __m128i vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf1); - - __m128i vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf1); - - __m128i vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); - vmulres1 = _mm_add_epi16(vmulres1, vbefore1); - - vmulres1 = _mm_add_epi16(vmulres1, vrnd); - vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); - - __m128i vres = _mm_packus_epi16(vmulres0, vmulres1); - - _mm_store_si128((__m128i*)dst_ptr, vres); - - // Calculate second half of 32 results, results 16-23 - __m128i vbefore2 = _mm_shuffle_epi8(vbeforeshifted, vshuf2); - - __m128i vinterpolate2 = _mm_shuffle_epi8(vinterpolate, vshuf2); - - __m128i vmulres2 = _mm_mullo_epi16(vinterpolate2, vmul); - vmulres2 = _mm_add_epi16(vmulres2, vbefore2); - - vmulres2 = _mm_add_epi16(vmulres2, vrnd); - vmulres2 = _mm_srai_epi16(vmulres2, log2_factor); - - // Calculate second half of 32 results, results 24-31 - __m128i vbefore3 = _mm_shuffle_epi8(vbeforeshifted, vshuf3); - - __m128i vinterpolate3 = _mm_shuffle_epi8(vinterpolate, vshuf3); - - __m128i vmulres3 = _mm_mullo_epi16(vinterpolate3, vmul); - vmulres3 = _mm_add_epi16(vmulres3, vbefore3); - - vmulres3 = _mm_add_epi16(vmulres3, vrnd); - vmulres3 = _mm_srai_epi16(vmulres3, log2_factor); + ALIGNED(32) uint8_t before[33]; + memcpy(&before[1], src_ptr, 32); + before[0] = ref_ptr[0]; + // memcpy(&before[1], src_ptr, 7 * sizeof(uint8_t)); + before[8] = ref_ptr[4]; + // memcpy(&before[9], src_ptr + 8, 7 * sizeof(uint8_t)); + before[16] = ref_ptr[8]; + // memcpy(&before[17], src_ptr + 16, 7 * sizeof(uint8_t)); + before[24] = ref_ptr[12]; + // memcpy(&before[25], src_ptr + 24, 7 * sizeof(uint8_t)); - vres = _mm_packus_epi16(vmulres2, vmulres3); + __m256i vbefore = _mm256_load_si256((__m256i*)before); + __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); - _mm_store_si128((__m128i*)(dst_ptr + 16), vres); + __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); + __m256i vleft = _mm256_avg_epu8(vmiddle, vbefore); + __m256i vright = _mm256_avg_epu8(vmiddle, vbehind); + + __m256i diff = _mm256_sub_epi8(vbehind, vbefore); + diff = _mm256_and_si256(diff, threes); + __m256i mask = _mm256_cmpeq_epi8(diff, ones); + __m256i sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); + + vleft = _mm256_sub_epi8(vleft, sub_amount); + + mask = _mm256_cmpeq_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); + + vright = _mm256_sub_epi8(vright, sub_amount); + + __m256i left_temp0 = _mm256_unpacklo_epi8(vleft, vmiddle); + __m256i left_temp1 = _mm256_unpackhi_epi8(vleft, vmiddle); + __m256i right_temp0 = _mm256_unpacklo_epi8(vright, vbehind); + __m256i right_temp1 = _mm256_unpackhi_epi8(vright, vbehind); + //left_temp0 = _mm256_permute4x64_epi64(left_temp0, _MM_SHUFFLE(3, 1, 2, 0)); + //left_temp1 = _mm256_permute4x64_epi64(left_temp1, _MM_SHUFFLE(3, 1, 2, 0)); + //right_temp0 = _mm256_permute4x64_epi64(right_temp0, _MM_SHUFFLE(3, 1, 2, 0)); + //right_temp1 = _mm256_permute4x64_epi64(right_temp1, _MM_SHUFFLE(3, 1, 2, 0)); + + 
__m256i vtmp0 = _mm256_unpacklo_epi16(left_temp0, right_temp0); + __m256i vtmp1 = _mm256_unpackhi_epi16(left_temp0, right_temp0); + __m256i vtmp2 = _mm256_unpacklo_epi16(left_temp1, right_temp1); + __m256i vtmp3 = _mm256_unpackhi_epi16(left_temp1, right_temp1); + //vtmp0 = _mm256_permute4x64_epi64(vtmp0, _MM_SHUFFLE(3, 1, 2, 0)); + //vtmp1 = _mm256_permute4x64_epi64(vtmp1, _MM_SHUFFLE(3, 1, 2, 0)); + //vtmp2 = _mm256_permute4x64_epi64(vtmp2, _MM_SHUFFLE(3, 1, 2, 0)); + //vtmp3 = _mm256_permute4x64_epi64(vtmp3, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i vtmp0_lo = _mm256_castsi256_si128(vtmp0); + __m128i vtmp0_hi = _mm256_extracti128_si256(vtmp0, 1); + __m128i vtmp1_lo = _mm256_castsi256_si128(vtmp1); + __m128i vtmp1_hi = _mm256_extracti128_si256(vtmp1, 1); + __m128i vtmp2_lo = _mm256_castsi256_si128(vtmp2); + __m128i vtmp2_hi = _mm256_extracti128_si256(vtmp2, 1); + __m128i vtmp3_lo = _mm256_castsi256_si128(vtmp3); + __m128i vtmp3_hi = _mm256_extracti128_si256(vtmp3, 1); + + _mm_store_si128((__m128i*)dst_ptr, vtmp0_lo); + _mm_store_si128((__m128i*)dst_ptr + 1, vtmp1_lo); + + _mm_store_si128((__m128i*)dst_ptr + 8, vtmp2_lo); + _mm_store_si128((__m128i*)dst_ptr + 9, vtmp3_lo); + + _mm_store_si128((__m128i*)dst_ptr + 16, vtmp0_hi); + _mm_store_si128((__m128i*)dst_ptr + 17, vtmp1_hi); + + _mm_store_si128((__m128i*)dst_ptr + 24, vtmp2_hi); + _mm_store_si128((__m128i*)dst_ptr + 25, vtmp3_hi); - dst_ptr += dst_step; - ref_ptr += ref_step; - src_ptr += red_pred_size; + src_ptr += 32; + ref_ptr += 16; + dst_ptr += 512; } } + + static void mip_upsampling_w32_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 4; From bb5561ea6e16d1f6a8a82d0b7869a65451233cda Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 3 Apr 2024 09:39:42 +0300 Subject: [PATCH 123/237] Reorder inputs instead of outputs --- src/strategies/avx2/intra-avx2.c | 53 ++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1104008f..e17ab8fe 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5530,6 +5530,9 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i threes = _mm256_set1_epi8(3); + __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + for (int i = 0; i < 32 * 32; i += 512) { ALIGNED(32) uint8_t before[33]; @@ -5546,6 +5549,9 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i vbefore = _mm256_load_si256((__m256i*)before); __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + vbefore = _mm256_permutevar8x32_epi32(vbefore, permute_mask); + vbehind = _mm256_permutevar8x32_epi32(vbehind, permute_mask); + __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); __m256i vleft = _mm256_avg_epu8(vmiddle, vbefore); __m256i vright = _mm256_avg_epu8(vmiddle, vbehind); @@ -5575,30 +5581,37 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i vtmp1 = _mm256_unpackhi_epi16(left_temp0, right_temp0); __m256i vtmp2 = _mm256_unpacklo_epi16(left_temp1, right_temp1); __m256i vtmp3 = _mm256_unpackhi_epi16(left_temp1, right_temp1); + + _mm256_store_si256((__m256i*)dst_ptr, vtmp0); + _mm256_store_si256((__m256i*)dst_ptr+4, vtmp1); + _mm256_store_si256((__m256i*)dst_ptr+8, vtmp2); + _mm256_store_si256((__m256i*)dst_ptr+12, vtmp3); + + //vtmp0 = 
_mm256_permute4x64_epi64(vtmp0, _MM_SHUFFLE(3, 1, 2, 0)); //vtmp1 = _mm256_permute4x64_epi64(vtmp1, _MM_SHUFFLE(3, 1, 2, 0)); //vtmp2 = _mm256_permute4x64_epi64(vtmp2, _MM_SHUFFLE(3, 1, 2, 0)); //vtmp3 = _mm256_permute4x64_epi64(vtmp3, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i vtmp0_lo = _mm256_castsi256_si128(vtmp0); - __m128i vtmp0_hi = _mm256_extracti128_si256(vtmp0, 1); - __m128i vtmp1_lo = _mm256_castsi256_si128(vtmp1); - __m128i vtmp1_hi = _mm256_extracti128_si256(vtmp1, 1); - __m128i vtmp2_lo = _mm256_castsi256_si128(vtmp2); - __m128i vtmp2_hi = _mm256_extracti128_si256(vtmp2, 1); - __m128i vtmp3_lo = _mm256_castsi256_si128(vtmp3); - __m128i vtmp3_hi = _mm256_extracti128_si256(vtmp3, 1); - - _mm_store_si128((__m128i*)dst_ptr, vtmp0_lo); - _mm_store_si128((__m128i*)dst_ptr + 1, vtmp1_lo); - - _mm_store_si128((__m128i*)dst_ptr + 8, vtmp2_lo); - _mm_store_si128((__m128i*)dst_ptr + 9, vtmp3_lo); - - _mm_store_si128((__m128i*)dst_ptr + 16, vtmp0_hi); - _mm_store_si128((__m128i*)dst_ptr + 17, vtmp1_hi); - - _mm_store_si128((__m128i*)dst_ptr + 24, vtmp2_hi); - _mm_store_si128((__m128i*)dst_ptr + 25, vtmp3_hi); + //__m128i vtmp0_lo = _mm256_castsi256_si128(vtmp0); + //__m128i vtmp0_hi = _mm256_extracti128_si256(vtmp0, 1); + //__m128i vtmp1_lo = _mm256_castsi256_si128(vtmp1); + //__m128i vtmp1_hi = _mm256_extracti128_si256(vtmp1, 1); + //__m128i vtmp2_lo = _mm256_castsi256_si128(vtmp2); + //__m128i vtmp2_hi = _mm256_extracti128_si256(vtmp2, 1); + //__m128i vtmp3_lo = _mm256_castsi256_si128(vtmp3); + //__m128i vtmp3_hi = _mm256_extracti128_si256(vtmp3, 1); + + //_mm_store_si128((__m128i*)dst_ptr, vtmp0_lo); + //_mm_store_si128((__m128i*)dst_ptr + 1, vtmp1_lo); + + //_mm_store_si128((__m128i*)dst_ptr + 8, vtmp2_lo); + //_mm_store_si128((__m128i*)dst_ptr + 9, vtmp3_lo); + + //_mm_store_si128((__m128i*)dst_ptr + 16, vtmp0_hi); + //_mm_store_si128((__m128i*)dst_ptr + 17, vtmp1_hi); + + //_mm_store_si128((__m128i*)dst_ptr + 24, vtmp2_hi); + //_mm_store_si128((__m128i*)dst_ptr + 25, vtmp3_hi); src_ptr += 32; ref_ptr += 16; From 3afaec6307ed79e2dc90c30aae5d498af9179845 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 11:17:51 +0300 Subject: [PATCH 124/237] Implement Joose's alt function to work with other heights than 32. Height 4 does not work with this functions since upscale factor is different. 
--- src/strategies/avx2/intra-avx2.c | 34 +++++++++++--------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e17ab8fe..930f3061 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5537,14 +5537,11 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg ALIGNED(32) uint8_t before[33]; memcpy(&before[1], src_ptr, 32); - before[0] = ref_ptr[0]; - // memcpy(&before[1], src_ptr, 7 * sizeof(uint8_t)); - before[8] = ref_ptr[4]; - // memcpy(&before[9], src_ptr + 8, 7 * sizeof(uint8_t)); - before[16] = ref_ptr[8]; - // memcpy(&before[17], src_ptr + 16, 7 * sizeof(uint8_t)); - before[24] = ref_ptr[12]; - // memcpy(&before[25], src_ptr + 24, 7 * sizeof(uint8_t)); + before[0] = ref_ptr[ref_step * 0]; + before[8] = ref_ptr[ref_step * 1]; + before[16] = ref_ptr[ref_step * 2]; + before[24] = ref_ptr[ref_step * 3]; + __m256i vbefore = _mm256_load_si256((__m256i*)before); __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); @@ -5572,26 +5569,17 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i left_temp1 = _mm256_unpackhi_epi8(vleft, vmiddle); __m256i right_temp0 = _mm256_unpacklo_epi8(vright, vbehind); __m256i right_temp1 = _mm256_unpackhi_epi8(vright, vbehind); - //left_temp0 = _mm256_permute4x64_epi64(left_temp0, _MM_SHUFFLE(3, 1, 2, 0)); - //left_temp1 = _mm256_permute4x64_epi64(left_temp1, _MM_SHUFFLE(3, 1, 2, 0)); - //right_temp0 = _mm256_permute4x64_epi64(right_temp0, _MM_SHUFFLE(3, 1, 2, 0)); - //right_temp1 = _mm256_permute4x64_epi64(right_temp1, _MM_SHUFFLE(3, 1, 2, 0)); __m256i vtmp0 = _mm256_unpacklo_epi16(left_temp0, right_temp0); __m256i vtmp1 = _mm256_unpackhi_epi16(left_temp0, right_temp0); __m256i vtmp2 = _mm256_unpacklo_epi16(left_temp1, right_temp1); __m256i vtmp3 = _mm256_unpackhi_epi16(left_temp1, right_temp1); - _mm256_store_si256((__m256i*)dst_ptr, vtmp0); - _mm256_store_si256((__m256i*)dst_ptr+4, vtmp1); - _mm256_store_si256((__m256i*)dst_ptr+8, vtmp2); - _mm256_store_si256((__m256i*)dst_ptr+12, vtmp3); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 0), vtmp0); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 1), vtmp1); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 2), vtmp2); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 3), vtmp3); - - //vtmp0 = _mm256_permute4x64_epi64(vtmp0, _MM_SHUFFLE(3, 1, 2, 0)); - //vtmp1 = _mm256_permute4x64_epi64(vtmp1, _MM_SHUFFLE(3, 1, 2, 0)); - //vtmp2 = _mm256_permute4x64_epi64(vtmp2, _MM_SHUFFLE(3, 1, 2, 0)); - //vtmp3 = _mm256_permute4x64_epi64(vtmp3, _MM_SHUFFLE(3, 1, 2, 0)); //__m128i vtmp0_lo = _mm256_castsi256_si128(vtmp0); //__m128i vtmp0_hi = _mm256_extracti128_si256(vtmp0, 1); //__m128i vtmp1_lo = _mm256_castsi256_si128(vtmp1); @@ -5614,8 +5602,8 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg //_mm_store_si128((__m128i*)dst_ptr + 25, vtmp3_hi); src_ptr += 32; - ref_ptr += 16; - dst_ptr += 512; + ref_ptr += ref_step * 4; + dst_ptr += dst_step * 4; } } From 5421fd87fb6b3717d26cab2f1ba323a413e9f12d Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 11:30:33 +0300 Subject: [PATCH 125/237] Add comments to clarify the functionality of alt version. 
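For reference, the avg_epu8-based upsamplers in the preceding patches replace the multiply-and-shift interpolation with chained byte averages plus a small rounding correction; the comments added below describe it on the vector level. A minimal scalar sketch of the same idea for one 1-to-4 step, assuming 8-bit samples (names are illustrative only, not part of the patch):

#include <stdint.h>

/* Rounds up like _mm256_avg_epu8: (x + y + 1) >> 1. */
static inline uint8_t avg_round_up(uint8_t x, uint8_t y) { return (uint8_t)((x + y + 1) >> 1); }

static inline void mip_ups4_avg_scalar_sketch(uint8_t before, uint8_t behind, uint8_t out[4])
{
  uint8_t middle = avg_round_up(before, behind);  /* exact: (2*before + 2*behind + 2) >> 2 */
  uint8_t left   = avg_round_up(middle, before);  /* target: (3*before + behind + 2) >> 2 */
  uint8_t right  = avg_round_up(middle, behind);  /* target: (before + 3*behind + 2) >> 2 */

  /* The chained averages overshoot by one when the two low bits of the
   * difference are 0b01 (left sample) or 0b11 (right sample). */
  const unsigned low_bits = (uint8_t)(behind - before) & 3u;
  if (low_bits == 1) left  -= 1;
  if (low_bits == 3) right -= 1;

  out[0] = left;
  out[1] = middle;
  out[2] = right;
  out[3] = behind;  /* the fourth output is the reduced prediction sample itself */
}

The sub/and/cmpeq/blendv sequence in the vector code builds the same correction masks that the two 'if' statements express here.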
--- src/strategies/avx2/intra-avx2.c | 34 +++++++++++--------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 930f3061..4f7a02e3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5533,8 +5533,10 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - for (int i = 0; i < 32 * 32; i += 512) { + // This will process 4 rows at a time. Limit is always 8 rows. + for (int i = 0; i < 2; ++i) { + // Assign references by hand after copying sources. This will avoid the use of inserts later. ALIGNED(32) uint8_t before[33]; memcpy(&before[1], src_ptr, 32); before[0] = ref_ptr[ref_step * 0]; @@ -5546,25 +5548,32 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i vbefore = _mm256_load_si256((__m256i*)before); __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + // Permute the input values to get the result in correct order. vbefore = _mm256_permutevar8x32_epi32(vbefore, permute_mask); vbehind = _mm256_permutevar8x32_epi32(vbehind, permute_mask); + // Calculate the 3 interpolated values between before and behind, middle, left and right. __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); __m256i vleft = _mm256_avg_epu8(vmiddle, vbefore); __m256i vright = _mm256_avg_epu8(vmiddle, vbehind); + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the left interpolated value if the two last bits of the difference between before and behind is 0b01. __m256i diff = _mm256_sub_epi8(vbehind, vbefore); diff = _mm256_and_si256(diff, threes); - __m256i mask = _mm256_cmpeq_epi8(diff, ones); + __m256i mask = _mm256_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. __m256i sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); vleft = _mm256_sub_epi8(vleft, sub_amount); + // Same rounding error handling for right interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. mask = _mm256_cmpeq_epi8(diff, threes); sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); vright = _mm256_sub_epi8(vright, sub_amount); + // Interleave results. 
__m256i left_temp0 = _mm256_unpacklo_epi8(vleft, vmiddle); __m256i left_temp1 = _mm256_unpackhi_epi8(vleft, vmiddle); __m256i right_temp0 = _mm256_unpacklo_epi8(vright, vbehind); @@ -5580,27 +5589,6 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 2), vtmp2); _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 3), vtmp3); - //__m128i vtmp0_lo = _mm256_castsi256_si128(vtmp0); - //__m128i vtmp0_hi = _mm256_extracti128_si256(vtmp0, 1); - //__m128i vtmp1_lo = _mm256_castsi256_si128(vtmp1); - //__m128i vtmp1_hi = _mm256_extracti128_si256(vtmp1, 1); - //__m128i vtmp2_lo = _mm256_castsi256_si128(vtmp2); - //__m128i vtmp2_hi = _mm256_extracti128_si256(vtmp2, 1); - //__m128i vtmp3_lo = _mm256_castsi256_si128(vtmp3); - //__m128i vtmp3_hi = _mm256_extracti128_si256(vtmp3, 1); - - //_mm_store_si128((__m128i*)dst_ptr, vtmp0_lo); - //_mm_store_si128((__m128i*)dst_ptr + 1, vtmp1_lo); - - //_mm_store_si128((__m128i*)dst_ptr + 8, vtmp2_lo); - //_mm_store_si128((__m128i*)dst_ptr + 9, vtmp3_lo); - - //_mm_store_si128((__m128i*)dst_ptr + 16, vtmp0_hi); - //_mm_store_si128((__m128i*)dst_ptr + 17, vtmp1_hi); - - //_mm_store_si128((__m128i*)dst_ptr + 24, vtmp2_hi); - //_mm_store_si128((__m128i*)dst_ptr + 25, vtmp3_hi); - src_ptr += 32; ref_ptr += ref_step * 4; dst_ptr += dst_step * 4; From e561992560e044b79733d731331e06140a26cf03 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 3 Apr 2024 12:35:19 +0300 Subject: [PATCH 126/237] Clean up some leftovers from alt function. Add more clarifying comments. --- src/strategies/avx2/intra-avx2.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 4f7a02e3..abe8217f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5510,17 +5510,7 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg const uint8_t ups_factor = 4; // width / red_pred_size const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - - __m128i vshuf = _mm_setr_epi8( - 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, - 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f - ); - __m128i vrnd = _mm_set1_epi16(rounding_offset); - - ALIGNED(32) int16_t refs[8]; - ALIGNED(32) int16_t srcs[8]; const uvg_pixel* ref_ptr = ref + ref_step - 1; const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -5529,14 +5519,15 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i ones = _mm256_set1_epi8(1); __m256i threes = _mm256_set1_epi8(3); - __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - // This will process 4 rows at a time. Limit is always 8 rows. for (int i = 0; i < 2; ++i) { // Assign references by hand after copying sources. This will avoid the use of inserts later. + // Before buffer length is 33 since we need to copy reference value into the first index. + // Copying 32 samples is faster than copying 31. First indices of each 8 wide row will be replaced + // with a reference value. 
ALIGNED(32) uint8_t before[33]; memcpy(&before[1], src_ptr, 32); before[0] = ref_ptr[ref_step * 0]; From dff4b0f11f7b9c234815dd96813cbf873d628699 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 4 Apr 2024 08:58:07 +0300 Subject: [PATCH 127/237] alternate version of ups8 --- src/strategies/avx2/intra-avx2.c | 360 ++++++++++--------------------- 1 file changed, 118 insertions(+), 242 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index abe8217f..413591bb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5587,7 +5587,6 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg } - static void mip_upsampling_w32_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 4; @@ -5698,261 +5697,138 @@ static void mip_upsampling_w32_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pix static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 8; // width / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - - __m128i vshuf = _mm_setr_epi8( - 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, - 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f - ); - - __m128i vrnd = _mm_set1_epi16(rounding_offset); - - ALIGNED(32) int16_t refs[8]; - ALIGNED(32) int16_t srcs[8]; const uvg_pixel* ref_ptr = ref + ref_step - 1; const uvg_pixel* src_ptr = src; - - int step = ref_step; - - for (int i = 0; i < 8; i++) { - for (int ref = 0; ref < 8; ++ref) { - refs[ref] = *ref_ptr; - srcs[ref] = *src_ptr; - - ref_ptr += step; - src_ptr += red_pred_size; - } - - __m128i vaccu_ref = _mm_load_si128((__m128i*)refs); - __m128i vsub_ref = vaccu_ref; - vaccu_ref = _mm_slli_epi16(vaccu_ref, log2_factor); - - __m128i vaccu_src = _mm_setzero_si128(); - __m128i vadd_src = _mm_load_si128((__m128i*)srcs); - - __m128i vres[8]; - for (int res = 0; res < 8; ++res) { - vaccu_ref = _mm_sub_epi16(vaccu_ref, vsub_ref); - vaccu_src = _mm_add_epi16(vaccu_src, vadd_src); - vres[res] = _mm_add_epi16(vaccu_ref, vaccu_src); - vres[res] = _mm_add_epi16(vres[res], vrnd); - vres[res] = _mm_srli_epi16(vres[res], log2_factor); - } - - __m128i vout0 = _mm_packus_epi16(vres[0], vres[1]); - __m128i vout1 = _mm_packus_epi16(vres[2], vres[3]); - __m128i vout2 = _mm_packus_epi16(vres[4], vres[5]); - __m128i vout3 = _mm_packus_epi16(vres[6], vres[7]); - vout0 = _mm_shuffle_epi8(vout0, vshuf); - vout1 = _mm_shuffle_epi8(vout1, vshuf); - vout2 = _mm_shuffle_epi8(vout2, vshuf); - vout3 = _mm_shuffle_epi8(vout3, vshuf); - - __m128i vtmp16lo0 = _mm_unpacklo_epi16(vout0, vout1); - __m128i vtmp16hi0 = _mm_unpackhi_epi16(vout0, vout1); - __m128i vtmp16lo1 = _mm_unpacklo_epi16(vout2, vout3); - __m128i vtmp16hi1 = _mm_unpackhi_epi16(vout2, vout3); - - __m128i vtmp32lo0 = _mm_unpacklo_epi32(vtmp16lo0, vtmp16lo1); - __m128i vtmp32hi0 = _mm_unpackhi_epi32(vtmp16lo0, vtmp16lo1); - __m128i vtmp32lo1 = _mm_unpacklo_epi32(vtmp16hi0, vtmp16hi1); - __m128i vtmp32hi1 = _mm_unpackhi_epi32(vtmp16hi0, vtmp16hi1); - - const int dst_offset = i * 8; - - *(uint64_t*)&dst[dst_offset + dst_step * 0] = _mm_extract_epi64(vtmp32lo0, 0); - *(uint64_t*)&dst[dst_offset + dst_step * 1] = _mm_extract_epi64(vtmp32lo0, 1); 
- *(uint64_t*)&dst[dst_offset + dst_step * 2] = _mm_extract_epi64(vtmp32hi0, 0); - *(uint64_t*)&dst[dst_offset + dst_step * 3] = _mm_extract_epi64(vtmp32hi0, 1); - *(uint64_t*)&dst[dst_offset + dst_step * 4] = _mm_extract_epi64(vtmp32lo1, 0); - *(uint64_t*)&dst[dst_offset + dst_step * 5] = _mm_extract_epi64(vtmp32lo1, 1); - *(uint64_t*)&dst[dst_offset + dst_step * 6] = _mm_extract_epi64(vtmp32hi1, 0); - *(uint64_t*)&dst[dst_offset + dst_step * 7] = _mm_extract_epi64(vtmp32hi1, 1); - - ref_ptr = src + i; - src_ptr = src + i + 1; - step = red_pred_size; // Switch ref step - } -} - -static void mip_upsampling_w64_ups8_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 8; // width / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - - __m128i vshufsrc = _mm_setr_epi8( - 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, - 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d - ); - - __m128i vshuf0 = _mm_setr_epi8( - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 - ); - - __m128i vshuf1 = _mm_setr_epi8( - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, - 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 - ); - - __m128i vshuf2 = _mm_setr_epi8( - 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, - 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05 - ); - - __m128i vshuf3 = _mm_setr_epi8( - 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, - 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 - ); - - __m128i vshuf4 = _mm_setr_epi8( - 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, - 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09 - ); - - __m128i vshuf5 = _mm_setr_epi8( - 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, - 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b - ); - - __m128i vshuf6 = _mm_setr_epi8( - 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, - 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d - ); - - __m128i vshuf7 = _mm_setr_epi8( - 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, - 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f + const uvg_pixel* dst_ptr = dst; + + const __m256i ones = _mm256_set1_epi8(1); + const __m256i twos = _mm256_set1_epi8(2); + const __m256i threes = _mm256_set1_epi8(3); + const __m256i fours = _mm256_set1_epi8(4); + const __m256i fives = _mm256_set1_epi8(5); + const __m256i sixes = _mm256_set1_epi8(6); + const __m256i sevens = _mm256_set1_epi8(7); + const __m256i eights = _mm256_set1_epi8(8); + + __m256i shuffle_mask = _mm256_setr_epi8( + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f ); - __m128i vrnd = _mm_set1_epi16(rounding_offset); - __m128i vmul = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - const uvg_pixel* ref_ptr = ref + ref_step - 1; - const uvg_pixel* src_ptr = src; - - int step = ref_step; - - uvg_pixel* dst_ptr = dst; - - for (int i = 0; i < 8; i++) { - // Handle input data - int16_t before = *ref_ptr; - __m128i vtmp = _mm_loadu_si128((__m128i*)src_ptr); - __m128i vbehind = _mm_cvtepu8_epi16(vtmp); - - __m128i vbefore = vbehind; - vbefore = _mm_shuffle_epi8(vbefore, vshufsrc); - vbefore = _mm_insert_epi16(vbefore, before, 0); - __m128i vbeforeshifted = _mm_slli_epi16(vbefore, log2_factor); - - __m128i vinterpolate = _mm_sub_epi16(vbehind, 
vbefore); - - // Calculate 1st 16 result chunk - __m128i vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf0); - __m128i vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf0); - - __m128i vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); - vmulres0 = _mm_add_epi16(vmulres0, vbefore0); - - vmulres0 = _mm_add_epi16(vmulres0, vrnd); - vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); - - __m128i vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf1); - __m128i vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf1); - - __m128i vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); - vmulres1 = _mm_add_epi16(vmulres1, vbefore1); - - vmulres1 = _mm_add_epi16(vmulres1, vrnd); - vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); - - __m128i vres = _mm_packus_epi16(vmulres0, vmulres1); - - _mm_store_si128((__m128i*)(dst_ptr + 0), vres); - - // Calculate 2nd 16 result chunk - vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf2); - vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf2); - - vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); - vmulres0 = _mm_add_epi16(vmulres0, vbefore0); - - vmulres0 = _mm_add_epi16(vmulres0, vrnd); - vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); - - vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf3); - vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf3); - - vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); - vmulres1 = _mm_add_epi16(vmulres1, vbefore1); - - vmulres1 = _mm_add_epi16(vmulres1, vrnd); - vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); - - vres = _mm_packus_epi16(vmulres0, vmulres1); - - _mm_store_si128((__m128i*)(dst_ptr + 16), vres); - - // Calculate 3rd 16 result chunk - vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf4); - vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf4); - - vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); - vmulres0 = _mm_add_epi16(vmulres0, vbefore0); - - vmulres0 = _mm_add_epi16(vmulres0, vrnd); - vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); - - vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf5); - vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf5); - - vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); - vmulres1 = _mm_add_epi16(vmulres1, vbefore1); - - vmulres1 = _mm_add_epi16(vmulres1, vrnd); - vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); - - vres = _mm_packus_epi16(vmulres0, vmulres1); - - _mm_store_si128((__m128i*)(dst_ptr + 32), vres); - - // Calculate 4th 16 result chunk - vbefore0 = _mm_shuffle_epi8(vbeforeshifted, vshuf6); - vinterpolate0 = _mm_shuffle_epi8(vinterpolate, vshuf6); + // This will process 2 rows at a time. Limit is always 8 rows. + for (int i = 0; i < 2; ++i) { - vmulres0 = _mm_mullo_epi16(vinterpolate0, vmul); - vmulres0 = _mm_add_epi16(vmulres0, vbefore0); + // Assign references by hand after copying sources. This will avoid the use of inserts later. + ALIGNED(32) uint8_t before[33]; + memcpy(&before[1], src_ptr, 32); + before[0] = ref_ptr[ref_step * 0]; + before[8] = ref_ptr[ref_step * 1]; + before[16] = ref_ptr[ref_step * 2]; + before[24] = ref_ptr[ref_step * 3]; - vmulres0 = _mm_add_epi16(vmulres0, vrnd); - vmulres0 = _mm_srai_epi16(vmulres0, log2_factor); + __m256i vbefore = _mm256_load_si256((__m256i*)before); + __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); - vbefore1 = _mm_shuffle_epi8(vbeforeshifted, vshuf7); - vinterpolate1 = _mm_shuffle_epi8(vinterpolate, vshuf7); + // Permute the input values to get the result in correct order. 
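+    // Note on the averaging further below: _mm256_avg_epu8 rounds up, so the cascaded averages can come out
+    // one larger than the exact (before * (8 - k) + behind * k + 4) >> 3 that the previous version computed.
+    // The masks built from the low three bits of (behind - before) subtract that overshoot where it occurs.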
+ vbefore = _mm256_shuffle_epi8(vbefore, shuffle_mask); + vbehind = _mm256_shuffle_epi8(vbehind, shuffle_mask); + vbefore = _mm256_permute4x64_epi64(vbefore, _MM_SHUFFLE(3, 1, 2, 0)); + vbehind = _mm256_permute4x64_epi64(vbehind, _MM_SHUFFLE(3, 1, 2, 0)); - vmulres1 = _mm_mullo_epi16(vinterpolate1, vmul); - vmulres1 = _mm_add_epi16(vmulres1, vbefore1); + // Calculate the 7 interpolated values between before and behind, middle, left and right. + __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); + __m256i vleft_middle = _mm256_avg_epu8(vmiddle, vbefore); + __m256i vright_middle = _mm256_avg_epu8(vmiddle, vbehind); + __m256i vleft_left = _mm256_avg_epu8(vbefore, vleft_middle); + __m256i vleft_right = _mm256_avg_epu8(vleft_middle, vmiddle); + __m256i vright_left = _mm256_avg_epu8(vmiddle, vright_middle); + __m256i vright_right = _mm256_avg_epu8(vright_middle, vbehind); + + // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + __m256i diff = _mm256_sub_epi8(vbehind, vbefore); + diff = _mm256_and_si256(diff, sevens); + __m256i three_diff = _mm256_and_si256(diff, threes); + + // Right side + __m256i mask = _mm256_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. + __m256i sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); // If 5, 6, 7 select one + vright_right = _mm256_sub_epi8(vright_right, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, threes); + sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); // If 3 or 7 select one + vright_middle = _mm256_sub_epi8(vright_middle, sub_amount); + + __m256i is_two = _mm256_cmpeq_epi8(diff, twos); + __m256i is_five = _mm256_cmpeq_epi8(diff, fives); + mask = _mm256_or_si256(mask, is_two); + mask = _mm256_or_si256(mask, is_five); + sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); // If 2, 3, 5, or 7 select one + vright_left = _mm256_sub_epi8(vright_left, sub_amount); + + // Left side + diff = _mm256_blendv_epi8(diff, eights, _mm256_cmpeq_epi8(_mm256_set1_epi8(0), diff)); // Replace zeros with eights to enable using GT + mask = _mm256_cmpgt_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(ones, _mm256_set1_epi8(0), mask); // If greater than three select zero + vleft_left = _mm256_sub_epi8(vleft_left, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, ones); + sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); // If 1 or 5 select one + vleft_middle = _mm256_sub_epi8(vleft_middle, sub_amount); - vmulres1 = _mm_add_epi16(vmulres1, vrnd); - vmulres1 = _mm_srai_epi16(vmulres1, log2_factor); + __m256i is_three = _mm256_cmpeq_epi8(diff, threes); + __m256i is_six = _mm256_cmpeq_epi8(diff, sixes); + mask = _mm256_or_si256(mask, is_three); + mask = _mm256_or_si256(mask, is_six); + sub_amount = _mm256_blendv_epi8(_mm256_set1_epi8(0), ones, mask); // If 1, 3, 5, 6 select one + vleft_right = _mm256_sub_epi8(vleft_right, sub_amount); - vres = _mm_packus_epi16(vmulres0, vmulres1); + // Interleave results. 
+ __m256i left_left_temp0 = _mm256_unpacklo_epi8(vleft_left, vleft_middle); + __m256i left_left_temp1 = _mm256_unpackhi_epi8(vleft_left, vleft_middle); + __m256i left_right_temp0 = _mm256_unpacklo_epi8(vleft_right, vmiddle); + __m256i left_right_temp1 = _mm256_unpackhi_epi8(vleft_right, vmiddle); + __m256i right_left_temp0 = _mm256_unpacklo_epi8(vright_left, vright_middle); + __m256i right_left_temp1 = _mm256_unpackhi_epi8(vright_left, vright_middle); + __m256i right_right_temp0 = _mm256_unpacklo_epi8(vright_right, vbehind); + __m256i right_right_temp1 = _mm256_unpackhi_epi8(vright_right, vbehind); + + __m256i vleft_temp0 = _mm256_unpacklo_epi16(left_left_temp0, left_right_temp0); + __m256i vleft_temp1 = _mm256_unpackhi_epi16(left_left_temp0, left_right_temp0); + __m256i vleft_temp2 = _mm256_unpacklo_epi16(left_left_temp1, left_right_temp1); + __m256i vleft_temp3 = _mm256_unpackhi_epi16(left_left_temp1, left_right_temp1); + __m256i vright_temp0 = _mm256_unpacklo_epi16(right_left_temp0, right_right_temp0); + __m256i vright_temp1 = _mm256_unpackhi_epi16(right_left_temp0, right_right_temp0); + __m256i vright_temp2 = _mm256_unpacklo_epi16(right_left_temp1, right_right_temp1); + __m256i vright_temp3 = _mm256_unpackhi_epi16(right_left_temp1, right_right_temp1); + + __m256i vtmp0 = _mm256_unpacklo_epi32(vleft_temp0, vright_temp0); + __m256i vtmp1 = _mm256_unpackhi_epi32(vleft_temp0, vright_temp0); + __m256i vtmp2 = _mm256_unpacklo_epi32(vleft_temp1, vright_temp1); + __m256i vtmp3 = _mm256_unpackhi_epi32(vleft_temp1, vright_temp1); + __m256i vtmp4 = _mm256_unpacklo_epi32(vleft_temp2, vright_temp2); + __m256i vtmp5 = _mm256_unpackhi_epi32(vleft_temp2, vright_temp2); + __m256i vtmp6 = _mm256_unpacklo_epi32(vleft_temp3, vright_temp3); + __m256i vtmp7 = _mm256_unpackhi_epi32(vleft_temp3, vright_temp3); + + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 0 + 00), vtmp0); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 0 + 32), vtmp1); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 1 + 00), vtmp2); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 1 + 32), vtmp3); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 2 + 00), vtmp4); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 2 + 32), vtmp5); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 3 + 00), vtmp6); + _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 3 + 32), vtmp7); - _mm_store_si128((__m128i*)(dst_ptr + 48), vres); - - dst_ptr += dst_step; - ref_ptr += ref_step; - src_ptr += red_pred_size; + src_ptr += 32; + ref_ptr += ref_step * 4; + dst_ptr += dst_step * 4; } } + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -6113,7 +5989,7 @@ void mip_predict_avx2( } break; case 64: - mip_upsampling_w64_ups8_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 8. + mip_upsampling_w64_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 8. break; default: assert(false && "Invalid MIP width.\n"); From ffc762dc526747806804e19730cc24f4045851dc Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 4 Apr 2024 18:26:41 +0300 Subject: [PATCH 128/237] Implement w4 vertical 1 to 8 upsampling. 
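
The 1 to 8 vertical upsampling follows the usual MIP interpolation: for each pair of a
"before" row (reference line or previous prediction row) and a "behind" row, the seven
rows in between are weighted averages with a rounding offset. The vector code gets there
by shifting the before row left by log2(factor) and then repeatedly adding
(behind - before), with the rounding offset applied before the final shift right.
A minimal scalar sketch of what one row pair produces (names are illustrative and not
part of the patch; uint8_t from <stdint.h>):

    /* Factor-8 vertical upsampling of one 4-pixel row pair. */
    static void ups8_ver_ref(uint8_t *dst, const uint8_t *before, const uint8_t *behind)
    {
        for (int k = 1; k <= 7; ++k) {          /* seven interpolated rows */
            for (int x = 0; x < 4; ++x) {
                int acc = (before[x] << 3) + 4 + k * (behind[x] - before[x]);
                dst[(k - 1) * 4 + x] = (uint8_t)(acc >> 3);
            }
        }
        for (int x = 0; x < 4; ++x)             /* eighth row is the behind row itself */
            dst[7 * 4 + x] = behind[x];
    }

The AVX2 version computes all four row pairs of the 4x4 reduced prediction at once and
packs the 16-bit intermediates back to bytes before storing.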
--- src/strategies/avx2/intra-avx2.c | 119 +++++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 413591bb..47ed231c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5829,6 +5829,94 @@ static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pix } +static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 8; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + /*__m128i vshufbefore = _mm_setr_epi8( + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b + );*/ + + __m256i vshufres = _mm256_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f + ); + + int32_t refline = *(int32_t*)ref; + //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); + //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); + __m128i vbehind = _mm_loadu_si128((__m128i*)src); + __m128i vbefore = vbehind; + + vbefore = _mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. + vbefore = _mm_insert_epi32(vbefore, refline, 0); + + __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); + __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + __m256i vrow3 = _mm256_add_epi16(vrow2, vinterpolate); + __m256i vrow4 = _mm256_add_epi16(vrow3, vinterpolate); + __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); + __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + + vrow0 = _mm256_add_epi16(vrow0, vrnd); + vrow1 = _mm256_add_epi16(vrow1, vrnd); + vrow2 = _mm256_add_epi16(vrow2, vrnd); + vrow3 = _mm256_add_epi16(vrow3, vrnd); + vrow4 = _mm256_add_epi16(vrow4, vrnd); + vrow5 = _mm256_add_epi16(vrow5, vrnd); + vrow6 = _mm256_add_epi16(vrow6, vrnd); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + vrow3 = _mm256_srai_epi16(vrow3, log2_factor); + vrow4 = _mm256_srai_epi16(vrow4, log2_factor); + vrow5 = _mm256_srai_epi16(vrow5, log2_factor); + vrow6 = _mm256_srai_epi16(vrow6, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vrow3); + __m256i vres2 = _mm256_packus_epi16(vrow4, vrow5); + __m256i vres3 = _mm256_packus_epi16(vrow6, vbehind256); + + vres0 = _mm256_shuffle_epi8(vres0, vshufres); + vres1 = _mm256_shuffle_epi8(vres1, vshufres); + vres2 = _mm256_shuffle_epi8(vres2, vshufres); + vres3 = _mm256_shuffle_epi8(vres3, vshufres); + + __m256i vupklo0 = _mm256_unpacklo_epi64(vres0, vres1); + __m256i vupklo1 = _mm256_unpacklo_epi64(vres2, vres3); + __m256i vupkhi0 = _mm256_unpackhi_epi64(vres0, vres1); + 
__m256i vupkhi1 = _mm256_unpackhi_epi64(vres2, vres3); + + vres0 = _mm256_permute2x128_si256(vupklo0, vupklo1, 0x20); + vres1 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x20); + vres2 = _mm256_permute2x128_si256(vupklo0, vupklo1, 0x31); + vres3 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x31); + + _mm256_store_si256((__m256i*)(dst + 0), vres0); + _mm256_store_si256((__m256i*)(dst + 32), vres1); + _mm256_store_si256((__m256i*)(dst + 64), vres2); + _mm256_store_si256((__m256i*)(dst + 96), vres3); +} + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -5997,15 +6085,34 @@ void mip_predict_avx2( } } + // void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, + // const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, + // const uint16_t src_step, const uint8_t src_stride, + // const uint8_t dst_step, const uint8_t dst_stride, + // const uint8_t boundary_step, + // const uint8_t ups_factor) + + uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { - switch (size_id) { - case 0: assert(false && "MIP upscale. Invalid size id.\n"); break; // No upscale is needed for size id 0 - case 1: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; - case 2: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + switch (width) { + case 4: + if (ups_ver_factor == 2) + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + else if (ups_ver_factor == 4) + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + else + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w4_ups8_ver_avx2(result, ver_src, ref_samples_top); + break; + + case 8: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 16: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 32: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 64: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; default: - assert(false && "Intra MIP: invalid size id.\n"); + assert(false && "Invalid MIP width.\n"); break; - } + } } } From fc700c45391fb7206136f5a41cbb08e02d59b8c6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 4 Apr 2024 19:01:24 +0300 Subject: [PATCH 129/237] Implement w4 vertical 1 to 4 upsampling. 
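
The 1 to 4 case is the same interpolation with log2_factor 2: three interpolated rows per
source row pair, then the behind row itself. Per sample this equals
(before * (4 - k) + behind * k + 2) >> 2 for k = 1..3. A rough scalar reference
(illustrative only, not the patch code):

    static void ups4_ver_ref(uint8_t *dst, const uint8_t *before, const uint8_t *behind)
    {
        for (int k = 1; k <= 3; ++k)
            for (int x = 0; x < 4; ++x)
                dst[(k - 1) * 4 + x] =
                    (uint8_t)(((before[x] << 2) + 2 + k * (behind[x] - before[x])) >> 2);
    }

The vector code again handles all four row pairs of the reduced prediction in one pass
and mainly differs from the factor-8 path in how the packed results are permuted back
into row order.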
--- src/strategies/avx2/intra-avx2.c | 175 +++++++++++++++++++++++++++++-- 1 file changed, 167 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 47ed231c..c4703e97 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5503,7 +5503,6 @@ static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix } } - static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -5586,7 +5585,6 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg } } - static void mip_upsampling_w32_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 4; @@ -5829,6 +5827,163 @@ static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pix } +static void mip_upsampling_w4_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 2; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + /*__m128i vshufbefore = _mm_setr_epi8( + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b + );*/ + + __m256i vshufres = _mm256_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f + ); + + int32_t refline = *(int32_t*)ref; + //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); + //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); + __m128i vbehind = _mm_loadu_si128((__m128i*)src); + __m128i vbefore = vbehind; + + vbefore = _mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. 
+ vbefore = _mm_insert_epi32(vbefore, refline, 0); + + __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); + __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + __m256i vrow3 = _mm256_add_epi16(vrow2, vinterpolate); + __m256i vrow4 = _mm256_add_epi16(vrow3, vinterpolate); + __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); + __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + + vrow0 = _mm256_add_epi16(vrow0, vrnd); + vrow1 = _mm256_add_epi16(vrow1, vrnd); + vrow2 = _mm256_add_epi16(vrow2, vrnd); + vrow3 = _mm256_add_epi16(vrow3, vrnd); + vrow4 = _mm256_add_epi16(vrow4, vrnd); + vrow5 = _mm256_add_epi16(vrow5, vrnd); + vrow6 = _mm256_add_epi16(vrow6, vrnd); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + vrow3 = _mm256_srai_epi16(vrow3, log2_factor); + vrow4 = _mm256_srai_epi16(vrow4, log2_factor); + vrow5 = _mm256_srai_epi16(vrow5, log2_factor); + vrow6 = _mm256_srai_epi16(vrow6, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vrow3); + __m256i vres2 = _mm256_packus_epi16(vrow4, vrow5); + __m256i vres3 = _mm256_packus_epi16(vrow6, vbehind256); + + vres0 = _mm256_shuffle_epi8(vres0, vshufres); + vres1 = _mm256_shuffle_epi8(vres1, vshufres); + vres2 = _mm256_shuffle_epi8(vres2, vshufres); + vres3 = _mm256_shuffle_epi8(vres3, vshufres); + + __m256i vupklo0 = _mm256_unpacklo_epi64(vres0, vres1); + __m256i vupklo1 = _mm256_unpacklo_epi64(vres2, vres3); + __m256i vupkhi0 = _mm256_unpackhi_epi64(vres0, vres1); + __m256i vupkhi1 = _mm256_unpackhi_epi64(vres2, vres3); + + vres0 = _mm256_permute2x128_si256(vupklo0, vupklo1, 0x20); + vres1 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x20); + vres2 = _mm256_permute2x128_si256(vupklo0, vupklo1, 0x31); + vres3 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x31); + + _mm256_store_si256((__m256i*)(dst + 0), vres0); + _mm256_store_si256((__m256i*)(dst + 32), vres1); + _mm256_store_si256((__m256i*)(dst + 64), vres2); + _mm256_store_si256((__m256i*)(dst + 96), vres3); +} + +static void mip_upsampling_w4_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 4; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + /*__m128i vshufbefore = _mm_setr_epi8( + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b + );*/ + + __m256i vshufres = _mm256_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f + ); + + __m256i vperm = _mm256_setr_epi32(0, 1, 4, 5, 8, 9, 12, 13); + + int32_t refline = *(int32_t*)ref; + //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); + //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); + __m128i vbehind = _mm_loadu_si128((__m128i*)src); + __m128i vbefore = vbehind; + + vbefore = 
_mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. + vbefore = _mm_insert_epi32(vbefore, refline, 0); + + __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); + __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + + vrow0 = _mm256_add_epi16(vrow0, vrnd); + vrow1 = _mm256_add_epi16(vrow1, vrnd); + vrow2 = _mm256_add_epi16(vrow2, vrnd); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256); + + vres0 = _mm256_shuffle_epi8(vres0, vshufres); + vres1 = _mm256_shuffle_epi8(vres1, vshufres); + + __m256i vlo128 = _mm256_permute2x128_si256(vres0, vres1, 0x20); + __m256i vhi128 = _mm256_permute2x128_si256(vres0, vres1, 0x31); + + vres0 = _mm256_permute4x64_epi64(vlo128, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vhi128, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + 0), vres0); + _mm256_store_si256((__m256i*)(dst + 32), vres1); + +} + static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 4; @@ -6092,17 +6247,21 @@ void mip_predict_avx2( // const uint8_t boundary_step, // const uint8_t ups_factor) - uvg_pixel tmp[64 * 64] = {0}; + //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: - if (ups_ver_factor == 2) + if (ups_ver_factor == 2) { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - else if (ups_ver_factor == 4) - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - else - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + } + else if (ups_ver_factor == 4) { + //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w4_ups4_ver_avx2(result, ver_src, ref_samples_top); + } + else { + mip_upsampling_w4_ups8_ver_avx2(result, ver_src, ref_samples_top); + } break; case 8: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; From 7e53861761b72e82abaecc9c89445bc1e1a614e6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 4 Apr 2024 19:10:49 +0300 Subject: [PATCH 130/237] Implement w4 vertical 1 to 2 upsampling. 
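
For a factor of 2 the single interpolated row is just the rounded average of the rows
above and below, (before + behind + 1) >> 1, which is exactly what _mm_avg_epu8 computes
per byte, so the whole 16-bit shift-and-accumulate path can be dropped. A scalar
equivalent of the kept code (illustrative names):

    static void ups2_ver_ref(uint8_t *mid, const uint8_t *before, const uint8_t *behind)
    {
        for (int x = 0; x < 4; ++x)
            mid[x] = (uint8_t)((before[x] + behind[x] + 1) >> 1);
    }

followed by interleaving the averaged rows with the source rows before the store.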
--- src/strategies/avx2/intra-avx2.c | 82 +++----------------------------- 1 file changed, 7 insertions(+), 75 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c4703e97..89616b0c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5832,87 +5832,20 @@ static void mip_upsampling_w4_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixe const uint8_t red_pred_size = 4; const uint8_t ups_factor = 2; // height / red_pred_size - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - /*__m128i vshufbefore = _mm_setr_epi8( - 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b - );*/ - - __m256i vshufres = _mm256_setr_epi8( - 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, - 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f - ); - int32_t refline = *(int32_t*)ref; - //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); - //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); __m128i vbehind = _mm_loadu_si128((__m128i*)src); __m128i vbefore = vbehind; vbefore = _mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. vbefore = _mm_insert_epi32(vbefore, refline, 0); - __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); - __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); - - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); - - __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); - __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); - __m256i vrow3 = _mm256_add_epi16(vrow2, vinterpolate); - __m256i vrow4 = _mm256_add_epi16(vrow3, vinterpolate); - __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); - __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); - - vrow0 = _mm256_add_epi16(vrow0, vrnd); - vrow1 = _mm256_add_epi16(vrow1, vrnd); - vrow2 = _mm256_add_epi16(vrow2, vrnd); - vrow3 = _mm256_add_epi16(vrow3, vrnd); - vrow4 = _mm256_add_epi16(vrow4, vrnd); - vrow5 = _mm256_add_epi16(vrow5, vrnd); - vrow6 = _mm256_add_epi16(vrow6, vrnd); - - vrow0 = _mm256_srai_epi16(vrow0, log2_factor); - vrow1 = _mm256_srai_epi16(vrow1, log2_factor); - vrow2 = _mm256_srai_epi16(vrow2, log2_factor); - vrow3 = _mm256_srai_epi16(vrow3, log2_factor); - vrow4 = _mm256_srai_epi16(vrow4, log2_factor); - vrow5 = _mm256_srai_epi16(vrow5, log2_factor); - vrow6 = _mm256_srai_epi16(vrow6, log2_factor); - - __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); - __m256i vres1 = _mm256_packus_epi16(vrow2, vrow3); - __m256i vres2 = _mm256_packus_epi16(vrow4, vrow5); - __m256i vres3 = _mm256_packus_epi16(vrow6, vbehind256); - - vres0 = _mm256_shuffle_epi8(vres0, vshufres); - vres1 = _mm256_shuffle_epi8(vres1, vshufres); - vres2 = _mm256_shuffle_epi8(vres2, vshufres); - vres3 = _mm256_shuffle_epi8(vres3, vshufres); - - __m256i vupklo0 = _mm256_unpacklo_epi64(vres0, vres1); - __m256i vupklo1 = _mm256_unpacklo_epi64(vres2, vres3); - __m256i vupkhi0 = _mm256_unpackhi_epi64(vres0, vres1); - __m256i vupkhi1 = _mm256_unpackhi_epi64(vres2, vres3); + __m128i vavg = _mm_avg_epu8(vbefore, vbehind); - vres0 = 
_mm256_permute2x128_si256(vupklo0, vupklo1, 0x20); - vres1 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x20); - vres2 = _mm256_permute2x128_si256(vupklo0, vupklo1, 0x31); - vres3 = _mm256_permute2x128_si256(vupkhi0, vupkhi1, 0x31); + __m128i vres0 = _mm_unpacklo_epi32(vavg, vbehind); + __m128i vres1 = _mm_unpackhi_epi32(vavg, vbehind); - _mm256_store_si256((__m256i*)(dst + 0), vres0); - _mm256_store_si256((__m256i*)(dst + 32), vres1); - _mm256_store_si256((__m256i*)(dst + 64), vres2); - _mm256_store_si256((__m256i*)(dst + 96), vres3); + _mm_store_si128((__m128i*)(dst + 0), vres0); + _mm_store_si128((__m128i*)(dst + 16), vres1); } static void mip_upsampling_w4_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) @@ -6252,14 +6185,13 @@ void mip_predict_avx2( switch (width) { case 4: if (ups_ver_factor == 2) { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w4_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w4_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w4_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; From 010483c2b55b09871ae766c2e20d165bce31cc8c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 4 Apr 2024 20:13:54 +0300 Subject: [PATCH 131/237] WIP on w8 ups2. --- src/strategies/avx2/intra-avx2.c | 47 ++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 89616b0c..1d107682 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6005,6 +6005,38 @@ static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe _mm256_store_si256((__m256i*)(dst + 96), vres3); } +static void mip_upsampling_w8_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const int height) +{ + int64_t refline = *(int64_t*)ref; + __m128i vidx0 = _mm_set_epi64x(0, 2); + __m128i vidx1 = _mm_set_epi64x(4, 6); + + __m128i vbehind0 = _mm_i64gather_epi64((const long long*)src, vidx0, 1); + __m128i vbehind1 = _mm_i64gather_epi64((const long long*)src, vidx1, 1); + + __m128i vbefore0 = vbehind0; + vbefore0 = _mm_slli_si128(vbefore0, 8); // Shift left to make room for one 64-bit integer. This could be done with a shuffle, but there should be no performance difference. + vbefore0 = _mm_insert_epi64(vbefore0, refline, 0); + + __m128i vbefore1 = vbehind1; + vbefore1 = _mm_slli_si128(vbefore1, 8); + + vbefore1 = _mm_blend_epi32(vbefore1, vbehind1, 0b0011); + + __m128i kek = _mm_setzero_si128(); + + + // Shuffle inputs to get the results in correct order. + + /*__m128i vavg = _mm_avg_epu8(vbefore, vbehind); + + __m128i vres0 = _mm_unpacklo_epi32(vavg, vbehind); + __m128i vres1 = _mm_unpackhi_epi32(vavg, vbehind); + + _mm_store_si128((__m128i*)(dst + 0), vres0); + _mm_store_si128((__m128i*)(dst + 16), vres1);*/ +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6180,7 +6212,7 @@ void mip_predict_avx2( // const uint8_t boundary_step, // const uint8_t ups_factor) - //uvg_pixel tmp[64 * 64] = {0}; + uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -6196,7 +6228,18 @@ void mip_predict_avx2( } break; - case 8: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 8: + if (ups_ver_factor == 2) { + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w8_ups2_ver_avx2(result, ver_src, ref_samples_top, height); + } + else if (ups_ver_factor == 4) { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + } + else { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + } + break; case 16: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; case 32: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; case 64: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; From 69f8be91360f9c636e5f7323ab19ac66762b2a21 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 5 Apr 2024 16:41:57 +0300 Subject: [PATCH 132/237] Implement w8 vertical 1 to 2 upsampling for height 8 and 16. --- src/strategies/avx2/intra-avx2.c | 173 +++++++++++++++++++++++++------ 1 file changed, 143 insertions(+), 30 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1d107682..abee5494 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5870,8 +5870,6 @@ static void mip_upsampling_w4_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f ); - __m256i vperm = _mm256_setr_epi32(0, 1, 4, 5, 8, 9, 12, 13); - int32_t refline = *(int32_t*)ref; //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); @@ -5886,15 +5884,18 @@ static void mip_upsampling_w4_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); - vrow0 = _mm256_add_epi16(vrow0, vrnd); + /*vrow0 = _mm256_add_epi16(vrow0, vrnd); vrow1 = _mm256_add_epi16(vrow1, vrnd); - vrow2 = _mm256_add_epi16(vrow2, vrnd); + vrow2 = _mm256_add_epi16(vrow2, vrnd);*/ vrow0 = _mm256_srai_epi16(vrow0, log2_factor); vrow1 = _mm256_srai_epi16(vrow1, log2_factor); @@ -5953,6 +5954,9 @@ static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); __m256i vrow0 
= _mm256_add_epi16(vbeforeshifted, vinterpolate); @@ -5963,13 +5967,14 @@ static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + /* vrow0 = _mm256_add_epi16(vrow0, vrnd); vrow1 = _mm256_add_epi16(vrow1, vrnd); vrow2 = _mm256_add_epi16(vrow2, vrnd); vrow3 = _mm256_add_epi16(vrow3, vrnd); vrow4 = _mm256_add_epi16(vrow4, vrnd); vrow5 = _mm256_add_epi16(vrow5, vrnd); - vrow6 = _mm256_add_epi16(vrow6, vrnd); + vrow6 = _mm256_add_epi16(vrow6, vrnd);*/ vrow0 = _mm256_srai_epi16(vrow0, log2_factor); vrow1 = _mm256_srai_epi16(vrow1, log2_factor); @@ -6005,36 +6010,146 @@ static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe _mm256_store_si256((__m256i*)(dst + 96), vres3); } -static void mip_upsampling_w8_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const int height) +static void mip_upsampling_w8_ups2_h8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { int64_t refline = *(int64_t*)ref; - __m128i vidx0 = _mm_set_epi64x(0, 2); - __m128i vidx1 = _mm_set_epi64x(4, 6); + __m128i vidx0 = _mm_set_epi64x(16, 0); + __m128i vidx1 = _mm_set_epi64x(32, 16); + __m128i vidx2 = _mm_set_epi64x(48, 32); __m128i vbehind0 = _mm_i64gather_epi64((const long long*)src, vidx0, 1); - __m128i vbehind1 = _mm_i64gather_epi64((const long long*)src, vidx1, 1); + __m128i vbefore1 = _mm_i64gather_epi64((const long long*)src, vidx1, 1); + __m128i vbehind1 = _mm_i64gather_epi64((const long long*)src, vidx2, 1); __m128i vbefore0 = vbehind0; vbefore0 = _mm_slli_si128(vbefore0, 8); // Shift left to make room for one 64-bit integer. This could be done with a shuffle, but there should be no performance difference. vbefore0 = _mm_insert_epi64(vbefore0, refline, 0); - __m128i vbefore1 = vbehind1; - vbefore1 = _mm_slli_si128(vbefore1, 8); - - vbefore1 = _mm_blend_epi32(vbefore1, vbehind1, 0b0011); + __m128i vavg0 = _mm_avg_epu8(vbefore0, vbehind0); + __m128i vavg1 = _mm_avg_epu8(vbefore1, vbehind1); - __m128i kek = _mm_setzero_si128(); + __m128i vres0 = _mm_unpacklo_epi64(vavg0, vbehind0); + __m128i vres1 = _mm_unpackhi_epi64(vavg0, vbehind0); + __m128i vres2 = _mm_unpacklo_epi64(vavg1, vbehind1); + __m128i vres3 = _mm_unpackhi_epi64(vavg1, vbehind1); + _mm_store_si128((__m128i*)(dst + 0), vres0); + _mm_store_si128((__m128i*)(dst + 16), vres1); + _mm_store_si128((__m128i*)(dst + 32), vres2); + _mm_store_si128((__m128i*)(dst + 48), vres3); +} + +static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + int64_t refline = *(int64_t*)ref; + __m128i vbehind0 = _mm_load_si128((__m128i*)(src + 0)); + __m128i vbefore1 = _mm_load_si128((__m128i*)(src + 8)); + __m128i vbehind1 = _mm_load_si128((__m128i*)(src + 16)); - // Shuffle inputs to get the results in correct order. + __m128i vbefore0 = vbehind0; + vbefore0 = _mm_slli_si128(vbefore0, 8); // Shift left to make room for one 64-bit integer. This could be done with a shuffle, but there should be no performance difference. 
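+  // After the reference row is inserted below, vbefore0 pairs each row in vbehind0 with the row
+  // preceding it, so one byte-wise average per pair produces the missing in-between rows.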
+ vbefore0 = _mm_insert_epi64(vbefore0, refline, 0); - /*__m128i vavg = _mm_avg_epu8(vbefore, vbehind); + __m128i vavg0 = _mm_avg_epu8(vbefore0, vbehind0); + __m128i vavg1 = _mm_avg_epu8(vbefore1, vbehind1); - __m128i vres0 = _mm_unpacklo_epi32(vavg, vbehind); - __m128i vres1 = _mm_unpackhi_epi32(vavg, vbehind); + __m128i vres0 = _mm_unpacklo_epi64(vavg0, vbehind0); + __m128i vres1 = _mm_unpackhi_epi64(vavg0, vbehind0); + __m128i vres2 = _mm_unpacklo_epi64(vavg1, vbehind1); + __m128i vres3 = _mm_unpackhi_epi64(vavg1, vbehind1); _mm_store_si128((__m128i*)(dst + 0), vres0); - _mm_store_si128((__m128i*)(dst + 16), vres1);*/ + _mm_store_si128((__m128i*)(dst + 16), vres1); + _mm_store_si128((__m128i*)(dst + 32), vres2); + _mm_store_si128((__m128i*)(dst + 48), vres3); + + vbefore0 = _mm_load_si128((__m128i*)(src + 24)); + vbehind0 = _mm_load_si128((__m128i*)(src + 32)); + vbefore1 = _mm_load_si128((__m128i*)(src + 40)); + vbehind1 = _mm_load_si128((__m128i*)(src + 48)); + + vavg0 = _mm_avg_epu8(vbefore0, vbehind0); + vavg1 = _mm_avg_epu8(vbefore1, vbehind1); + + vres0 = _mm_unpacklo_epi64(vavg0, vbehind0); + vres1 = _mm_unpackhi_epi64(vavg0, vbehind0); + vres2 = _mm_unpacklo_epi64(vavg1, vbehind1); + vres3 = _mm_unpackhi_epi64(vavg1, vbehind1); + + _mm_store_si128((__m128i*)(dst + 64), vres0); + _mm_store_si128((__m128i*)(dst + 80), vres1); + _mm_store_si128((__m128i*)(dst + 96), vres2); + _mm_store_si128((__m128i*)(dst + 112), vres3); +} + +static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 4; + const uint8_t ups_factor = 4; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + /*__m128i vshufbefore = _mm_setr_epi8( + 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b + );*/ + + __m256i vshufres = _mm256_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f + ); + + int32_t refline = *(int32_t*)ref; + //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); + //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); + __m128i vbehind = _mm_loadu_si128((__m128i*)src); + __m128i vbefore = vbehind; + + vbefore = _mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. 
+ vbefore = _mm_insert_epi32(vbefore, refline, 0); + + __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); + __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + + /*vrow0 = _mm256_add_epi16(vrow0, vrnd); + vrow1 = _mm256_add_epi16(vrow1, vrnd); + vrow2 = _mm256_add_epi16(vrow2, vrnd);*/ + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256); + + vres0 = _mm256_shuffle_epi8(vres0, vshufres); + vres1 = _mm256_shuffle_epi8(vres1, vshufres); + + __m256i vlo128 = _mm256_permute2x128_si256(vres0, vres1, 0x20); + __m256i vhi128 = _mm256_permute2x128_si256(vres0, vres1, 0x31); + + vres0 = _mm256_permute4x64_epi64(vlo128, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vhi128, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + 0), vres0); + _mm256_store_si256((__m256i*)(dst + 32), vres1); + } /** \brief Matrix weighted intra prediction. @@ -6205,19 +6320,11 @@ void mip_predict_avx2( } } - // void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, - // const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, - // const uint16_t src_step, const uint8_t src_stride, - // const uint8_t dst_step, const uint8_t dst_stride, - // const uint8_t boundary_step, - // const uint8_t ups_factor) - uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: if (ups_ver_factor == 2) { - //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w4_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { @@ -6230,11 +6337,17 @@ void mip_predict_avx2( case 8: if (ups_ver_factor == 2) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - mip_upsampling_w8_ups2_ver_avx2(result, ver_src, ref_samples_top, height); + if (height == 8) { + mip_upsampling_w8_ups2_h8_ver_avx2(result, ver_src, ref_samples_top); + } + else { // Height == 16 + //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w8_ups2_h16_ver_avx2(result, ver_src, ref_samples_top); + } } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w8_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); From 0118a847beb95ab65b3e2f159bdd1d0d19f76aca Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 14:15:24 +0300 
Subject: [PATCH 133/237] Implement w8 vertical 1 to 4 upsampling. --- src/strategies/avx2/intra-avx2.c | 65 ++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index abee5494..00f3034c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6084,34 +6084,20 @@ static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_ static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { - const uint8_t red_pred_size = 4; + const uint8_t red_pred_size = 8; const uint8_t ups_factor = 4; // height / red_pred_size const int log2_factor = uvg_g_convert_to_log2[ups_factor]; const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - /*__m128i vshufbefore = _mm_setr_epi8( - 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b - );*/ - - __m256i vshufres = _mm256_setr_epi8( - 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, - 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, - 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f - ); - int32_t refline = *(int32_t*)ref; - //__m128i vidx = _mm_setr_epi32(0, 32, 64, 96); - //__m128i vbehind = _mm_i32gather_epi32((const int*)src, vidx, 1); + int64_t refline = *(int64_t*)ref; __m128i vbehind = _mm_loadu_si128((__m128i*)src); __m128i vbefore = vbehind; - vbefore = _mm_slli_si128(vbefore, 4); // Shift left to make room for one 32-bit integer. This could be done with a shuffle, but there should be no performance difference. - vbefore = _mm_insert_epi32(vbefore, refline, 0); + vbefore = _mm_slli_si128(vbefore, 8); // Shift left to make room for one 64-bit integer. This could be done with a shuffle, but there should be no performance difference. 
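+  // With the reference row inserted, vbefore pairs each 8-pixel row in vbehind with the row above it:
+  // the reference line for the first reduced row, the previous reduced row for the second.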
+ vbefore = _mm_insert_epi64(vbefore, refline, 0); __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); @@ -6127,10 +6113,6 @@ static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); - /*vrow0 = _mm256_add_epi16(vrow0, vrnd); - vrow1 = _mm256_add_epi16(vrow1, vrnd); - vrow2 = _mm256_add_epi16(vrow2, vrnd);*/ - vrow0 = _mm256_srai_epi16(vrow0, log2_factor); vrow1 = _mm256_srai_epi16(vrow1, log2_factor); vrow2 = _mm256_srai_epi16(vrow2, log2_factor); @@ -6138,18 +6120,42 @@ static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256); - vres0 = _mm256_shuffle_epi8(vres0, vshufres); - vres1 = _mm256_shuffle_epi8(vres1, vshufres); - __m256i vlo128 = _mm256_permute2x128_si256(vres0, vres1, 0x20); __m256i vhi128 = _mm256_permute2x128_si256(vres0, vres1, 0x31); - vres0 = _mm256_permute4x64_epi64(vlo128, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vhi128, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)(dst + 0), vlo128); + _mm256_store_si256((__m256i*)(dst + 32), vhi128); - _mm256_store_si256((__m256i*)(dst + 0), vres0); - _mm256_store_si256((__m256i*)(dst + 32), vres1); + for (int i = 1; i < 4; ++i) { + vbefore = _mm_loadu_si128((__m128i*)(src + (i * 16 - 8))); + vbehind = _mm_loadu_si128((__m128i*)(src + (i * 16))); + vbefore256 = _mm256_cvtepu8_epi16(vbefore); + vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + + vres0 = _mm256_packus_epi16(vrow0, vrow1); + vres1 = _mm256_packus_epi16(vrow2, vbehind256); + + vlo128 = _mm256_permute2x128_si256(vres0, vres1, 0x20); + vhi128 = _mm256_permute2x128_si256(vres0, vres1, 0x31); + + _mm256_store_si256((__m256i*)(dst + (i * 64) + 0), vlo128); + _mm256_store_si256((__m256i*)(dst + (i * 64) + 32), vhi128); + } } /** \brief Matrix weighted intra prediction. @@ -6336,6 +6342,7 @@ void mip_predict_avx2( break; case 8: + // TODO: remove the if clauses and add a switch for height if (ups_ver_factor == 2) { if (height == 8) { mip_upsampling_w8_ups2_h8_ver_avx2(result, ver_src, ref_samples_top); From 5da45f9f3309618d6f4f2eabd7733759030989b8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 15:13:35 +0300 Subject: [PATCH 134/237] Implement w16 vertical 1 to 2 upsample. 
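
With 16-pixel rows each source row fills a whole XMM register, so the factor-2 case needs
no shuffling at all: every missing row is one _mm_avg_epu8 of the row above (reference
line or previous source row) and the row below, stored straight to its slot. Only the
averaged rows are written here; the source rows are left in place from the earlier pass.
A scalar sketch (illustrative only; the offsets match the loads in this patch):

    static void w16_ups2_ver_ref(uint8_t *dst, const uint8_t *src, const uint8_t *ref)
    {
        const uint8_t *above = ref;               /* row above the first source row */
        for (int r = 0; r < 8; ++r) {
            const uint8_t *below = src + r * 32;  /* source rows are 32 bytes apart */
            for (int x = 0; x < 16; ++x)
                dst[r * 32 + x] = (uint8_t)((above[x] + below[x] + 1) >> 1);
            above = below;
        }
    }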
--- src/strategies/avx2/intra-avx2.c | 159 ++++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 00f3034c..7629731f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6091,7 +6091,6 @@ static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe const int rounding_offset = 1 << (log2_factor - 1); __m256i vrnd = _mm256_set1_epi16(rounding_offset); - int64_t refline = *(int64_t*)ref; __m128i vbehind = _mm_loadu_si128((__m128i*)src); __m128i vbefore = vbehind; @@ -6158,6 +6157,148 @@ static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe } } +static void mip_upsampling_w8_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + int64_t refline = *(int64_t*)ref; + __m128i vbehind = _mm_loadu_si128((__m128i*)src); + __m128i vbefore = vbehind; + + vbefore = _mm_slli_si128(vbefore, 8); // Shift left to make room for one 64-bit integer. This could be done with a shuffle, but there should be no performance difference. + vbefore = _mm_insert_epi64(vbefore, refline, 0); + + __m256i vbefore256 = _mm256_cvtepu8_epi16(vbefore); + __m256i vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + __m256i vrow3 = _mm256_add_epi16(vrow2, vinterpolate); + __m256i vrow4 = _mm256_add_epi16(vrow3, vinterpolate); + __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); + __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + vrow3 = _mm256_srai_epi16(vrow3, log2_factor); + vrow4 = _mm256_srai_epi16(vrow4, log2_factor); + vrow5 = _mm256_srai_epi16(vrow5, log2_factor); + vrow6 = _mm256_srai_epi16(vrow6, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vrow3); + __m256i vres2 = _mm256_packus_epi16(vrow4, vrow5); + __m256i vres3 = _mm256_packus_epi16(vrow6, vbehind256); + + __m256i vlo128a = _mm256_permute2x128_si256(vres0, vres1, 0x20); + __m256i vlo128b = _mm256_permute2x128_si256(vres2, vres3, 0x20); + __m256i vhi128a = _mm256_permute2x128_si256(vres0, vres1, 0x31); + __m256i vhi128b = _mm256_permute2x128_si256(vres2, vres3, 0x31); + + _mm256_store_si256((__m256i*)(dst + 0), vlo128a); + _mm256_store_si256((__m256i*)(dst + 32), vlo128b); + _mm256_store_si256((__m256i*)(dst + 64), vhi128a); + _mm256_store_si256((__m256i*)(dst + 96), vhi128b); + + for (int i = 1; i < 4; ++i) { + vbefore = _mm_loadu_si128((__m128i*)(src + (i * 16 - 8))); + vbehind = _mm_loadu_si128((__m128i*)(src + (i * 16))); + vbefore256 = _mm256_cvtepu8_epi16(vbefore); + vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + vbeforeshifted = 
_mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + vrow3 = _mm256_add_epi16(vrow2, vinterpolate); + vrow4 = _mm256_add_epi16(vrow3, vinterpolate); + vrow5 = _mm256_add_epi16(vrow4, vinterpolate); + vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + vrow3 = _mm256_srai_epi16(vrow3, log2_factor); + vrow4 = _mm256_srai_epi16(vrow4, log2_factor); + vrow5 = _mm256_srai_epi16(vrow5, log2_factor); + vrow6 = _mm256_srai_epi16(vrow6, log2_factor); + + vres0 = _mm256_packus_epi16(vrow0, vrow1); + vres1 = _mm256_packus_epi16(vrow2, vrow3); + vres2 = _mm256_packus_epi16(vrow4, vrow5); + vres3 = _mm256_packus_epi16(vrow6, vbehind256); + + vlo128a = _mm256_permute2x128_si256(vres0, vres1, 0x20); + vlo128b = _mm256_permute2x128_si256(vres2, vres3, 0x20); + vhi128a = _mm256_permute2x128_si256(vres0, vres1, 0x31); + vhi128b = _mm256_permute2x128_si256(vres2, vres3, 0x31); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vlo128a); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vlo128b); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vhi128a); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 96), vhi128b); + } +} + +static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + __m128i vbehind0 = _mm_loadu_si128((__m128i*)(src + 0)); + __m128i vbehind1 = _mm_loadu_si128((__m128i*)(src + 32)); + __m128i vbehind2 = _mm_loadu_si128((__m128i*)(src + 64)); + __m128i vbehind3 = _mm_loadu_si128((__m128i*)(src + 96)); + __m128i vbehind4 = _mm_loadu_si128((__m128i*)(src + 128)); + __m128i vbehind5 = _mm_loadu_si128((__m128i*)(src + 160)); + __m128i vbehind6 = _mm_loadu_si128((__m128i*)(src + 192)); + __m128i vbehind7 = _mm_loadu_si128((__m128i*)(src + 224)); + + __m128i vbefore0 = _mm_load_si128((__m128i*)ref); + __m128i vbefore1 = vbehind0; + __m128i vbefore2 = vbehind1; + __m128i vbefore3 = vbehind2; + __m128i vbefore4 = vbehind3; + __m128i vbefore5 = vbehind4; + __m128i vbefore6 = vbehind5; + __m128i vbefore7 = vbehind6; + + __m128i vavg0 = _mm_avg_epu8(vbefore0, vbehind0); + __m128i vavg1 = _mm_avg_epu8(vbefore1, vbehind1); + __m128i vavg2 = _mm_avg_epu8(vbefore2, vbehind2); + __m128i vavg3 = _mm_avg_epu8(vbefore3, vbehind3); + __m128i vavg4 = _mm_avg_epu8(vbefore4, vbehind4); + __m128i vavg5 = _mm_avg_epu8(vbefore5, vbehind5); + __m128i vavg6 = _mm_avg_epu8(vbefore6, vbehind6); + __m128i vavg7 = _mm_avg_epu8(vbefore7, vbehind7); + + _mm_store_si128((__m128i*)(dst + 0), vavg0); + _mm_store_si128((__m128i*)(dst + 32), vavg1); + _mm_store_si128((__m128i*)(dst + 64), vavg2); + _mm_store_si128((__m128i*)(dst + 96), vavg3); + _mm_store_si128((__m128i*)(dst + 128), vavg4); + _mm_store_si128((__m128i*)(dst + 160), vavg5); + _mm_store_si128((__m128i*)(dst + 192), vavg6); + _mm_store_si128((__m128i*)(dst + 224), vavg7); +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6353,14 +6494,26 @@ void mip_predict_avx2( } } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w8_ups4_ver_avx2(result, ver_src, ref_samples_top); } + else { + mip_upsampling_w8_ups8_ver_avx2(result, ver_src, ref_samples_top); + } + break; + case 16: + if (ups_ver_factor == 2) { + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); + } + else if (ups_ver_factor == 4) { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); + } else { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; - case 16: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; case 32: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; case 64: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; default: From 035af8607ea33dfb7850657f4a8e72c176bc4760 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 15:41:47 +0300 Subject: [PATCH 135/237] Implement w16 vertical 1 to 4 upsampling. 
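The 1-to-4 kernel follows the usual MIP linear interpolation: each of the three missing rows between "before" and "behind" is (before << 2) + rounding + k * (behind - before), shifted back down arithmetically. A scalar sketch of the same computation for one 16-wide row (illustrative names, not from the patch):

    const int log2_factor = 2;                /* upsampling factor 4 */
    const int rnd = 1 << (log2_factor - 1);
    for (int x = 0; x < 16; ++x) {
      int acc  = (before[x] << log2_factor) + rnd;
      int step = behind[x] - before[x];
      for (int k = 1; k <= 3; ++k) {
        acc += step;                          /* before*4 + rnd + k*step */
        dst[(k - 1) * 16 + x] = (uvg_pixel)(acc >> log2_factor);
      }
    }
    /* The fourth row equals 'behind' and is already present in the output. */

The AVX2 version does the same with 16-bit lanes and packs the three rows back to bytes with unsigned saturation.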
--- src/strategies/avx2/intra-avx2.c | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7629731f..9e48fb1c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6299,6 +6299,53 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 224), vavg7); } +static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 4; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256; + __m256i vbehind256; + + __m128i vbefore = _mm_load_si128((__m128i*)ref); + vbefore256 = _mm256_cvtepu8_epi16(vbefore); + + for (int i = 0; i < 8; ++i) { + __m128i vbehind = _mm_loadu_si128((__m128i*)(src + (i * 64))); + vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 64) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 64) + 32), vres1); + + vbefore256 = vbehind256; + } +} + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -6502,12 +6549,11 @@ void mip_predict_avx2( break; case 16: if (ups_ver_factor == 2) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); From 8548bd0ef363c312aed1fe6233f43b34acb499b6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 15:46:50 +0300 Subject: [PATCH 136/237] Implement w16 vertical 1 to 8 upsampling. 
--- src/strategies/avx2/intra-avx2.c | 68 +++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9e48fb1c..38326dbd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6346,6 +6346,67 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256; + __m256i vbehind256; + + __m128i vbefore = _mm_load_si128((__m128i*)ref); + vbefore256 = _mm256_cvtepu8_epi16(vbefore); + + for (int i = 0; i < 8; ++i) { + __m128i vbehind = _mm_loadu_si128((__m128i*)(src + (i * 128))); + vbehind256 = _mm256_cvtepu8_epi16(vbehind); + + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); + + __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); + __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); + __m256i vrow3 = _mm256_add_epi16(vrow2, vinterpolate); + __m256i vrow4 = _mm256_add_epi16(vrow3, vinterpolate); + __m256i vrow5 = _mm256_add_epi16(vrow4, vinterpolate); + __m256i vrow6 = _mm256_add_epi16(vrow5, vinterpolate); + + vrow0 = _mm256_srai_epi16(vrow0, log2_factor); + vrow1 = _mm256_srai_epi16(vrow1, log2_factor); + vrow2 = _mm256_srai_epi16(vrow2, log2_factor); + vrow3 = _mm256_srai_epi16(vrow3, log2_factor); + vrow4 = _mm256_srai_epi16(vrow4, log2_factor); + vrow5 = _mm256_srai_epi16(vrow5, log2_factor); + vrow6 = _mm256_srai_epi16(vrow6, log2_factor); + + __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vrow3); + __m256i vres2 = _mm256_packus_epi16(vrow4, vrow5); + __m256i vres3 = _mm256_packus_epi16(vrow6, vbehind256); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0)); + vres3 = _mm256_permute4x64_epi64(vres3, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres2); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 96), vres3); + + vbefore256 = vbehind256; + } +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6514,7 +6575,7 @@ void mip_predict_avx2( } } - uvg_pixel tmp[64 * 64] = {0}; + //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -6536,7 +6597,6 @@ void mip_predict_avx2( mip_upsampling_w8_ups2_h8_ver_avx2(result, ver_src, ref_samples_top); } else { // Height == 16 - //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w8_ups2_h16_ver_avx2(result, ver_src, ref_samples_top); } } @@ -6552,12 +6612,10 @@ void mip_predict_avx2( mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; case 32: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; From 6b5976e47eca984aa72ec90b265f034e8bbd2211 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 16:09:29 +0300 Subject: [PATCH 137/237] Implement w32 vertical 1 to 2 upsampling. --- src/strategies/avx2/intra-avx2.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 38326dbd..912d57eb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6407,6 +6407,20 @@ static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + __m256i vbefore = _mm256_load_si256((__m256i*)ref); + + for (int i = 0; i < 8; ++i) { + __m256i vbehind = _mm256_load_si256((__m256i*)(src + (i * 64))); + __m256i vavg = _mm256_avg_epu8(vbefore, vbehind); + + _mm256_store_si256((__m256i*)(dst + (i * 64)), vavg); + + vbefore = vbehind; + } +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6575,7 +6589,7 @@ void mip_predict_avx2( } } - //uvg_pixel tmp[64 * 64] = {0}; + uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -6607,6 +6621,7 @@ void mip_predict_avx2( mip_upsampling_w8_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; + case 16: if (ups_ver_factor == 2) { mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); @@ -6618,7 +6633,22 @@ void mip_predict_avx2( mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; - case 32: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + + case 32: + if (ups_ver_factor == 2) { + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w32_ups2_ver_avx2(result, ver_src, ref_samples_top); + } + else if (ups_ver_factor == 4) { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); + } + else { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w32_ups8_ver_avx2(result, ver_src, ref_samples_top); + } + break; + case 64: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; default: assert(false && "Invalid MIP width.\n"); From f8a5c37eb5b3b4908d8b3ad72ffb98cfb90d71f9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 16:42:34 +0300 Subject: [PATCH 138/237] Implement w32 vertical 1 to 4 upsampling. 
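A note on the 32-wide kernels: the 16-bit interpolation only fits 16 pixels per 256-bit register, so each output row is computed as a left and a right 16-pixel half. Packing the halves back to bytes works per 128-bit lane, so a cross-lane permute is needed to restore natural pixel order. A minimal sketch of that pack/permute pair as used here:

    /* lo holds output pixels 0..15 as 16-bit values, hi holds pixels 16..31. */
    __m256i packed = _mm256_packus_epi16(lo, hi);
    /* Bytes now come out as 0..7, 16..23, 8..15, 24..31 because packus works
     * per 128-bit lane; reordering the 64-bit quadwords fixes the order. */
    packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));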
--- src/strategies/avx2/intra-avx2.c | 101 ++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 912d57eb..12084547 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6421,6 +6421,102 @@ static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 4; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256a; + __m256i vbehind256a; + + __m256i vbefore256b; + __m256i vbehind256b; + + __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); + __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); + vbefore256a = _mm256_cvtepu8_epi16(vbeforea); + vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); + + for (int i = 0; i < 8; ++i) { + __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 128) + 0)); + __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 128) + 16)); + vbehind256a = _mm256_cvtepu8_epi16(vbehinda); + vbehind256b = _mm256_cvtepu8_epi16(vbehindb); + + // Calculate left side of 32 wide lane + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); + + __m256i vrowleft0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowleft1 = _mm256_add_epi16(vrowleft0, vinterpolate); + __m256i vrowleft2 = _mm256_add_epi16(vrowleft1, vinterpolate); + + vrowleft0 = _mm256_srai_epi16(vrowleft0, log2_factor); + vrowleft1 = _mm256_srai_epi16(vrowleft1, log2_factor); + vrowleft2 = _mm256_srai_epi16(vrowleft2, log2_factor); + + /*__m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); + __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256a); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1);*/ + + + // Calculate right side of 32 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); + + __m256i vrowright0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowright1 = _mm256_add_epi16(vrowright0, vinterpolate); + __m256i vrowright2 = _mm256_add_epi16(vrowright1, vinterpolate); + + vrowright0 = _mm256_srai_epi16(vrowright0, log2_factor); + vrowright1 = _mm256_srai_epi16(vrowright1, log2_factor); + vrowright2 = _mm256_srai_epi16(vrowright2, log2_factor); + + /*vres0 = _mm256_packus_epi16(vrow0, vrow1); + vres1 = _mm256_packus_epi16(vrow2, vbehind256b); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 96), vres1);*/ + + __m256i vres0 = _mm256_packus_epi16(vrowleft0, vrowright0); + __m256i vres1 = _mm256_packus_epi16(vrowleft1, 
vrowright1); + __m256i vres2 = _mm256_packus_epi16(vrowleft2, vrowright2); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres2); + + vbefore256a = vbehind256a; + vbefore256b = vbehind256b; + } +} + + + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -6636,12 +6732,11 @@ void mip_predict_avx2( case 32: if (ups_ver_factor == 2) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w32_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); + //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); From c141e5dfd9f74ec4ac2292860c19075f8f491674 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 16:49:09 +0300 Subject: [PATCH 139/237] Implement w32 vertical 1 to 8 upsampling. --- src/strategies/avx2/intra-avx2.c | 128 ++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 21 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 12084547..501504aa 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6463,15 +6463,6 @@ static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vrowleft1 = _mm256_srai_epi16(vrowleft1, log2_factor); vrowleft2 = _mm256_srai_epi16(vrowleft2, log2_factor); - /*__m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); - __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256a); - - vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1);*/ - // Calculate right side of 32 wide lane vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); @@ -6489,15 +6480,8 @@ static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vrowright1 = _mm256_srai_epi16(vrowright1, log2_factor); vrowright2 = _mm256_srai_epi16(vrowright2, log2_factor); - /*vres0 = _mm256_packus_epi16(vrow0, vrow1); - vres1 = _mm256_packus_epi16(vrow2, vbehind256b); - - vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 96), vres1);*/ + // Store results __m256i vres0 = _mm256_packus_epi16(vrowleft0, vrowright0); __m256i vres1 = _mm256_packus_epi16(vrowleft1, vrowright1); __m256i vres2 = _mm256_packus_epi16(vrowleft2, vrowright2); @@ -6515,7 +6499,111 @@ static void 
mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256a; + __m256i vbehind256a; + + __m256i vbefore256b; + __m256i vbehind256b; + + __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); + __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); + vbefore256a = _mm256_cvtepu8_epi16(vbeforea); + vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); + + for (int i = 0; i < 8; ++i) { + __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 256) + 0)); + __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 256) + 16)); + vbehind256a = _mm256_cvtepu8_epi16(vbehinda); + vbehind256b = _mm256_cvtepu8_epi16(vbehindb); + + // Calculate left side of 32 wide lane + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); + __m256i vrowleft0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowleft1 = _mm256_add_epi16(vrowleft0, vinterpolate); + __m256i vrowleft2 = _mm256_add_epi16(vrowleft1, vinterpolate); + __m256i vrowleft3 = _mm256_add_epi16(vrowleft2, vinterpolate); + __m256i vrowleft4 = _mm256_add_epi16(vrowleft3, vinterpolate); + __m256i vrowleft5 = _mm256_add_epi16(vrowleft4, vinterpolate); + __m256i vrowleft6 = _mm256_add_epi16(vrowleft5, vinterpolate); + + vrowleft0 = _mm256_srai_epi16(vrowleft0, log2_factor); + vrowleft1 = _mm256_srai_epi16(vrowleft1, log2_factor); + vrowleft2 = _mm256_srai_epi16(vrowleft2, log2_factor); + vrowleft3 = _mm256_srai_epi16(vrowleft3, log2_factor); + vrowleft4 = _mm256_srai_epi16(vrowleft4, log2_factor); + vrowleft5 = _mm256_srai_epi16(vrowleft5, log2_factor); + vrowleft6 = _mm256_srai_epi16(vrowleft6, log2_factor); + + + // Calculate right side of 32 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); + + __m256i vrowright0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowright1 = _mm256_add_epi16(vrowright0, vinterpolate); + __m256i vrowright2 = _mm256_add_epi16(vrowright1, vinterpolate); + __m256i vrowright3 = _mm256_add_epi16(vrowright2, vinterpolate); + __m256i vrowright4 = _mm256_add_epi16(vrowright3, vinterpolate); + __m256i vrowright5 = _mm256_add_epi16(vrowright4, vinterpolate); + __m256i vrowright6 = _mm256_add_epi16(vrowright5, vinterpolate); + + vrowright0 = _mm256_srai_epi16(vrowright0, log2_factor); + vrowright1 = _mm256_srai_epi16(vrowright1, log2_factor); + vrowright2 = _mm256_srai_epi16(vrowright2, log2_factor); + vrowright3 = _mm256_srai_epi16(vrowright3, log2_factor); + vrowright4 = _mm256_srai_epi16(vrowright4, log2_factor); + vrowright5 = _mm256_srai_epi16(vrowright5, log2_factor); + vrowright6 = _mm256_srai_epi16(vrowright6, log2_factor); + + + // Store results + __m256i vres0 = _mm256_packus_epi16(vrowleft0, vrowright0); + __m256i vres1 = _mm256_packus_epi16(vrowleft1, vrowright1); + __m256i vres2 = _mm256_packus_epi16(vrowleft2, vrowright2); + __m256i vres3 
= _mm256_packus_epi16(vrowleft3, vrowright3); + __m256i vres4 = _mm256_packus_epi16(vrowleft4, vrowright4); + __m256i vres5 = _mm256_packus_epi16(vrowleft5, vrowright5); + __m256i vres6 = _mm256_packus_epi16(vrowleft6, vrowright6); + + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0)); + vres3 = _mm256_permute4x64_epi64(vres3, _MM_SHUFFLE(3, 1, 2, 0)); + vres4 = _mm256_permute4x64_epi64(vres4, _MM_SHUFFLE(3, 1, 2, 0)); + vres5 = _mm256_permute4x64_epi64(vres5, _MM_SHUFFLE(3, 1, 2, 0)); + vres6 = _mm256_permute4x64_epi64(vres6, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (i * 256) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 32), vres1); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 64), vres2); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 96), vres3); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 128), vres4); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 160), vres5); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 192), vres6); + + vbefore256a = vbehind256a; + vbefore256b = vbehind256b; + } +} /** \brief Matrix weighted intra prediction. */ @@ -6685,7 +6773,7 @@ void mip_predict_avx2( } } - uvg_pixel tmp[64 * 64] = {0}; + //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -6735,12 +6823,10 @@ void mip_predict_avx2( mip_upsampling_w32_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w32_ups8_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w32_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; From aa6f036127a13e76d0fcf76fd8bd1c8a76062e02 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 16:58:52 +0300 Subject: [PATCH 140/237] Implement w64 vertical 1 to 2 upsampling. --- src/strategies/avx2/intra-avx2.c | 37 ++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 501504aa..a6564f18 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6605,6 +6605,25 @@ static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); + __m256i vbeforeright = _mm256_load_si256((__m256i*)(ref + 32)); + + for (int i = 0; i < 8; ++i) { + __m256i vbehindleft = _mm256_load_si256((__m256i*)(src + (i * 128) + 0)); + __m256i vbehindright = _mm256_load_si256((__m256i*)(src + (i * 128) + 32)); + __m256i vavgleft = _mm256_avg_epu8(vbeforeleft, vbehindleft); + __m256i vavgright = _mm256_avg_epu8(vbeforeright, vbehindright); + + _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vavgleft); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vavgright); + + vbeforeleft = vbehindleft; + vbeforeright = vbehindright; + } +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6773,7 +6792,7 @@ void mip_predict_avx2( } } - //uvg_pixel tmp[64 * 64] = {0}; + uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -6830,7 +6849,21 @@ void mip_predict_avx2( } break; - case 64: uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); break; + case 64: + if (ups_ver_factor == 2) { + uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w64_ups2_ver_avx2(result, ver_src, ref_samples_top); + } + else if (ups_ver_factor == 4) { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); + } + else { + uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + //mip_upsampling_w64_ups8_ver_avx2(result, ver_src, ref_samples_top); + } + break; + default: assert(false && "Invalid MIP width.\n"); break; From 3c3156c32de48b2b394a7ed0f356b5967a94d0a3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 17:09:02 +0300 Subject: [PATCH 141/237] Implement w64 vertical 1 to 4 upsampling. --- src/strategies/avx2/intra-avx2.c | 149 ++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a6564f18..87c06276 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6624,6 +6624,151 @@ static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 4; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256a; + __m256i vbehind256a; + + __m256i vbefore256b; + __m256i vbehind256b; + + __m256i vbefore256c; + __m256i vbehind256c; + + __m256i vbefore256d; + __m256i vbehind256d; + + __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); + __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); + __m128i vbeforec = _mm_load_si128((__m128i*)(ref + 32)); + __m128i vbefored = _mm_load_si128((__m128i*)(ref + 48)); + vbefore256a = _mm256_cvtepu8_epi16(vbeforea); + vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); + vbefore256c = _mm256_cvtepu8_epi16(vbeforec); + vbefore256d = _mm256_cvtepu8_epi16(vbefored); + + for (int i = 0; i < 8; ++i) { + __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 256) + 0)); + __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 256) + 16)); + __m128i vbehindc = _mm_loadu_si128((__m128i*)(src + (i * 256) + 32)); + __m128i vbehindd = _mm_loadu_si128((__m128i*)(src + (i * 256) + 48)); + vbehind256a = _mm256_cvtepu8_epi16(vbehinda); + vbehind256b = _mm256_cvtepu8_epi16(vbehindb); + vbehind256c = _mm256_cvtepu8_epi16(vbehindc); + vbehind256d = _mm256_cvtepu8_epi16(vbehindd); + + // Calculate 1/4 part of 64 wide lane + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i 
vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); + + __m256i vrowa0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowa1 = _mm256_add_epi16(vrowa0, vinterpolate); + __m256i vrowa2 = _mm256_add_epi16(vrowa1, vinterpolate); + + vrowa0 = _mm256_srai_epi16(vrowa0, log2_factor); + vrowa1 = _mm256_srai_epi16(vrowa1, log2_factor); + vrowa2 = _mm256_srai_epi16(vrowa2, log2_factor); + + + // Calculate 2/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); + + __m256i vrowb0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowb1 = _mm256_add_epi16(vrowb0, vinterpolate); + __m256i vrowb2 = _mm256_add_epi16(vrowb1, vinterpolate); + + vrowb0 = _mm256_srai_epi16(vrowb0, log2_factor); + vrowb1 = _mm256_srai_epi16(vrowb1, log2_factor); + vrowb2 = _mm256_srai_epi16(vrowb2, log2_factor); + + + // Calculate 3/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256c, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256c, vbefore256c); + + __m256i vrowc0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowc1 = _mm256_add_epi16(vrowc0, vinterpolate); + __m256i vrowc2 = _mm256_add_epi16(vrowc1, vinterpolate); + + vrowc0 = _mm256_srai_epi16(vrowc0, log2_factor); + vrowc1 = _mm256_srai_epi16(vrowc1, log2_factor); + vrowc2 = _mm256_srai_epi16(vrowc2, log2_factor); + + + // Calculate 3/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256d, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256d, vbefore256d); + + __m256i vrowd0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowd1 = _mm256_add_epi16(vrowd0, vinterpolate); + __m256i vrowd2 = _mm256_add_epi16(vrowd1, vinterpolate); + + vrowd0 = _mm256_srai_epi16(vrowd0, log2_factor); + vrowd1 = _mm256_srai_epi16(vrowd1, log2_factor); + vrowd2 = _mm256_srai_epi16(vrowd2, log2_factor); + + + // Store results + __m256i vres0left = _mm256_packus_epi16(vrowa0, vrowb0); + __m256i vres0right = _mm256_packus_epi16(vrowc0, vrowd0); + __m256i vres1left = _mm256_packus_epi16(vrowa1, vrowb1); + __m256i vres1right = _mm256_packus_epi16(vrowc1, vrowd1); + __m256i vres2left = _mm256_packus_epi16(vrowa2, vrowb2); + __m256i vres2right = _mm256_packus_epi16(vrowc2, vrowd2); + + /*vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0));*/ + + vres0left = _mm256_permute4x64_epi64(vres0left, _MM_SHUFFLE(3, 1, 2, 0)); + vres0right = _mm256_permute4x64_epi64(vres0right, _MM_SHUFFLE(3, 1, 2, 0)); + vres1left = _mm256_permute4x64_epi64(vres1left, _MM_SHUFFLE(3, 1, 2, 0)); + vres1right = _mm256_permute4x64_epi64(vres1right, _MM_SHUFFLE(3, 1, 2, 0)); + vres2left = _mm256_permute4x64_epi64(vres2left, _MM_SHUFFLE(3, 1, 2, 0)); + vres2right = _mm256_permute4x64_epi64(vres2right, _MM_SHUFFLE(3, 1, 2, 0)); + + /*_mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1); + _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres2);*/ + + _mm256_store_si256((__m256i*)(dst + (i * 256) + 0), vres0left); + _mm256_store_si256((__m256i*)(dst + (i 
* 256) + 32), vres0right); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 64), vres1left); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 96), vres1right); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 128), vres2left); + _mm256_store_si256((__m256i*)(dst + (i * 256) + 160), vres2right); + + vbefore256a = vbehind256a; + vbefore256b = vbehind256b; + vbefore256c = vbehind256c; + vbefore256d = vbehind256d; + } +} + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -6851,12 +6996,10 @@ void mip_predict_avx2( case 64: if (ups_ver_factor == 2) { - uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); mip_upsampling_w64_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); From 498e5ebeb640d9b61f8decea870489c8e28581ba Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 8 Apr 2024 17:38:41 +0300 Subject: [PATCH 142/237] Implement w64 vertical 1 to 8 upsampling. --- src/strategies/avx2/intra-avx2.c | 208 ++++++++++++++++++++++++++++++- 1 file changed, 205 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 87c06276..59051da6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6769,6 +6769,208 @@ static void mip_upsampling_w64_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uint8_t red_pred_size = 8; + const uint8_t ups_factor = 8; // height / red_pred_size + + const int log2_factor = uvg_g_convert_to_log2[ups_factor]; + const int rounding_offset = 1 << (log2_factor - 1); + __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m256i vbefore256a; + __m256i vbehind256a; + + __m256i vbefore256b; + __m256i vbehind256b; + + __m256i vbefore256c; + __m256i vbehind256c; + + __m256i vbefore256d; + __m256i vbehind256d; + + __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); + __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); + __m128i vbeforec = _mm_load_si128((__m128i*)(ref + 32)); + __m128i vbefored = _mm_load_si128((__m128i*)(ref + 48)); + vbefore256a = _mm256_cvtepu8_epi16(vbeforea); + vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); + vbefore256c = _mm256_cvtepu8_epi16(vbeforec); + vbefore256d = _mm256_cvtepu8_epi16(vbefored); + + for (int i = 0; i < 8; ++i) { + __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 512) + 0)); + __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 512) + 16)); + __m128i vbehindc = _mm_loadu_si128((__m128i*)(src + (i * 512) + 32)); + __m128i vbehindd = _mm_loadu_si128((__m128i*)(src + (i * 512) + 48)); + vbehind256a = _mm256_cvtepu8_epi16(vbehinda); + vbehind256b = _mm256_cvtepu8_epi16(vbehindb); + vbehind256c = _mm256_cvtepu8_epi16(vbehindc); + vbehind256d = _mm256_cvtepu8_epi16(vbehindd); + + // Calculate 1/4 part of 64 wide lane + __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); + + // Add rounding offset + vbeforeshifted = 
_mm256_add_epi16(vbeforeshifted, vrnd); + + __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); + + __m256i vrowa0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowa1 = _mm256_add_epi16(vrowa0, vinterpolate); + __m256i vrowa2 = _mm256_add_epi16(vrowa1, vinterpolate); + __m256i vrowa3 = _mm256_add_epi16(vrowa2, vinterpolate); + __m256i vrowa4 = _mm256_add_epi16(vrowa3, vinterpolate); + __m256i vrowa5 = _mm256_add_epi16(vrowa4, vinterpolate); + __m256i vrowa6 = _mm256_add_epi16(vrowa5, vinterpolate); + + vrowa0 = _mm256_srai_epi16(vrowa0, log2_factor); + vrowa1 = _mm256_srai_epi16(vrowa1, log2_factor); + vrowa2 = _mm256_srai_epi16(vrowa2, log2_factor); + vrowa3 = _mm256_srai_epi16(vrowa3, log2_factor); + vrowa4 = _mm256_srai_epi16(vrowa4, log2_factor); + vrowa5 = _mm256_srai_epi16(vrowa5, log2_factor); + vrowa6 = _mm256_srai_epi16(vrowa6, log2_factor); + + + // Calculate 2/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); + + __m256i vrowb0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowb1 = _mm256_add_epi16(vrowb0, vinterpolate); + __m256i vrowb2 = _mm256_add_epi16(vrowb1, vinterpolate); + __m256i vrowb3 = _mm256_add_epi16(vrowb2, vinterpolate); + __m256i vrowb4 = _mm256_add_epi16(vrowb3, vinterpolate); + __m256i vrowb5 = _mm256_add_epi16(vrowb4, vinterpolate); + __m256i vrowb6 = _mm256_add_epi16(vrowb5, vinterpolate); + + vrowb0 = _mm256_srai_epi16(vrowb0, log2_factor); + vrowb1 = _mm256_srai_epi16(vrowb1, log2_factor); + vrowb2 = _mm256_srai_epi16(vrowb2, log2_factor); + vrowb3 = _mm256_srai_epi16(vrowb3, log2_factor); + vrowb4 = _mm256_srai_epi16(vrowb4, log2_factor); + vrowb5 = _mm256_srai_epi16(vrowb5, log2_factor); + vrowb6 = _mm256_srai_epi16(vrowb6, log2_factor); + + + // Calculate 3/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256c, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256c, vbefore256c); + + __m256i vrowc0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowc1 = _mm256_add_epi16(vrowc0, vinterpolate); + __m256i vrowc2 = _mm256_add_epi16(vrowc1, vinterpolate); + __m256i vrowc3 = _mm256_add_epi16(vrowc2, vinterpolate); + __m256i vrowc4 = _mm256_add_epi16(vrowc3, vinterpolate); + __m256i vrowc5 = _mm256_add_epi16(vrowc4, vinterpolate); + __m256i vrowc6 = _mm256_add_epi16(vrowc5, vinterpolate); + + vrowc0 = _mm256_srai_epi16(vrowc0, log2_factor); + vrowc1 = _mm256_srai_epi16(vrowc1, log2_factor); + vrowc2 = _mm256_srai_epi16(vrowc2, log2_factor); + vrowc3 = _mm256_srai_epi16(vrowc3, log2_factor); + vrowc4 = _mm256_srai_epi16(vrowc4, log2_factor); + vrowc5 = _mm256_srai_epi16(vrowc5, log2_factor); + vrowc6 = _mm256_srai_epi16(vrowc6, log2_factor); + + + // Calculate 3/4 part of 64 wide lane + vbeforeshifted = _mm256_slli_epi16(vbefore256d, log2_factor); + + // Add rounding offset + vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); + + vinterpolate = _mm256_sub_epi16(vbehind256d, vbefore256d); + + __m256i vrowd0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); + __m256i vrowd1 = _mm256_add_epi16(vrowd0, vinterpolate); + __m256i vrowd2 = _mm256_add_epi16(vrowd1, vinterpolate); + __m256i vrowd3 = _mm256_add_epi16(vrowd2, vinterpolate); + __m256i vrowd4 = _mm256_add_epi16(vrowd3, vinterpolate); + __m256i vrowd5 = 
_mm256_add_epi16(vrowd4, vinterpolate); + __m256i vrowd6 = _mm256_add_epi16(vrowd5, vinterpolate); + + vrowd0 = _mm256_srai_epi16(vrowd0, log2_factor); + vrowd1 = _mm256_srai_epi16(vrowd1, log2_factor); + vrowd2 = _mm256_srai_epi16(vrowd2, log2_factor); + vrowd3 = _mm256_srai_epi16(vrowd3, log2_factor); + vrowd4 = _mm256_srai_epi16(vrowd4, log2_factor); + vrowd5 = _mm256_srai_epi16(vrowd5, log2_factor); + vrowd6 = _mm256_srai_epi16(vrowd6, log2_factor); + + + // Store results + __m256i vres00 = _mm256_packus_epi16(vrowa0, vrowb0); + __m256i vres01 = _mm256_packus_epi16(vrowc0, vrowd0); + + __m256i vres10 = _mm256_packus_epi16(vrowa1, vrowb1); + __m256i vres11 = _mm256_packus_epi16(vrowc1, vrowd1); + + __m256i vres20 = _mm256_packus_epi16(vrowa2, vrowb2); + __m256i vres21 = _mm256_packus_epi16(vrowc2, vrowd2); + + __m256i vres30 = _mm256_packus_epi16(vrowa3, vrowb3); + __m256i vres31 = _mm256_packus_epi16(vrowc3, vrowd3); + + __m256i vres40 = _mm256_packus_epi16(vrowa4, vrowb4); + __m256i vres41 = _mm256_packus_epi16(vrowc4, vrowd4); + + __m256i vres50 = _mm256_packus_epi16(vrowa5, vrowb5); + __m256i vres51 = _mm256_packus_epi16(vrowc5, vrowd5); + + __m256i vres60 = _mm256_packus_epi16(vrowa6, vrowb6); + __m256i vres61 = _mm256_packus_epi16(vrowc6, vrowd6); + + + vres00 = _mm256_permute4x64_epi64(vres00, _MM_SHUFFLE(3, 1, 2, 0)); + vres01 = _mm256_permute4x64_epi64(vres01, _MM_SHUFFLE(3, 1, 2, 0)); + vres10 = _mm256_permute4x64_epi64(vres10, _MM_SHUFFLE(3, 1, 2, 0)); + vres11 = _mm256_permute4x64_epi64(vres11, _MM_SHUFFLE(3, 1, 2, 0)); + vres20 = _mm256_permute4x64_epi64(vres20, _MM_SHUFFLE(3, 1, 2, 0)); + vres21 = _mm256_permute4x64_epi64(vres21, _MM_SHUFFLE(3, 1, 2, 0)); + vres30 = _mm256_permute4x64_epi64(vres30, _MM_SHUFFLE(3, 1, 2, 0)); + vres31 = _mm256_permute4x64_epi64(vres31, _MM_SHUFFLE(3, 1, 2, 0)); + vres40 = _mm256_permute4x64_epi64(vres40, _MM_SHUFFLE(3, 1, 2, 0)); + vres41 = _mm256_permute4x64_epi64(vres41, _MM_SHUFFLE(3, 1, 2, 0)); + vres50 = _mm256_permute4x64_epi64(vres50, _MM_SHUFFLE(3, 1, 2, 0)); + vres51 = _mm256_permute4x64_epi64(vres51, _MM_SHUFFLE(3, 1, 2, 0)); + vres60 = _mm256_permute4x64_epi64(vres60, _MM_SHUFFLE(3, 1, 2, 0)); + vres61 = _mm256_permute4x64_epi64(vres61, _MM_SHUFFLE(3, 1, 2, 0)); + + + _mm256_store_si256((__m256i*)(dst + (i * 512) + 0), vres00); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 32), vres01); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 64), vres10); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 96), vres11); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 128), vres20); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 160), vres21); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 192), vres30); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 224), vres31); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 256), vres40); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 288), vres41); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 320), vres50); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 352), vres51); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 384), vres60); + _mm256_store_si256((__m256i*)(dst + (i * 512) + 416), vres61); + + + vbefore256a = vbehind256a; + vbefore256b = vbehind256b; + vbefore256c = vbehind256c; + vbefore256d = vbehind256d; + } +} + /** \brief Matrix weighted intra prediction. 
*/ void mip_predict_avx2( @@ -6937,7 +7139,7 @@ void mip_predict_avx2( } } - uvg_pixel tmp[64 * 64] = {0}; + //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -7002,8 +7204,8 @@ void mip_predict_avx2( mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - uvg_mip_pred_upsampling_1D_ver_avx2(result, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w64_ups8_ver_avx2(result, ver_src, ref_samples_top); + //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); + mip_upsampling_w64_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; From ec706e15ba0c3f193d702532b6fd041a3112f1e3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 9 Apr 2024 15:05:48 +0300 Subject: [PATCH 143/237] Implement alternate version of w64 vertical 1 to 4 upsampling. --- src/strategies/avx2/intra-avx2.c | 87 +++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 59051da6..b49554c0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5827,6 +5827,8 @@ static void mip_upsampling_w64_ups8_hor_avx2(uvg_pixel* const dst, const uvg_pix } + + static void mip_upsampling_w4_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 4; @@ -6769,6 +6771,86 @@ static void mip_upsampling_w64_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); + __m256i vbeforeright = _mm256_load_si256((__m256i*)(ref + 32)); + + __m256i zeros = _mm256_setzero_si256(); + __m256i ones = _mm256_set1_epi8(1); + __m256i threes = _mm256_set1_epi8(3); + __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + for (int i = 0; i < 8; ++i) { + // Calculate 4 lines at a time + __m256i vbehindleft = _mm256_load_si256((__m256i*)(src_ptr + 0)); + __m256i vbehindright = _mm256_load_si256((__m256i*)(src_ptr + 32)); + + // Calculate left side of 64 wide lane + // Calculate the 3 interpolated lines between before and behind. Top row, middle row and bottom row. + __m256i vmiddleleft = _mm256_avg_epu8(vbeforeleft, vbehindleft); + __m256i vtopleft = _mm256_avg_epu8(vbeforeleft, vmiddleleft); + __m256i vbottomleft = _mm256_avg_epu8(vmiddleleft, vbehindleft); + + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the left interpolated value if the two last bits of the difference between before and behind is 0b01. + __m256i diff = _mm256_sub_epi8(vbehindleft, vbeforeleft); + diff = _mm256_and_si256(diff, threes); + __m256i mask = _mm256_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. + __m256i sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vtopleft = _mm256_sub_epi8(vtopleft, sub_amount); + + // Same rounding error handling for bottom interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. 
+ mask = _mm256_cmpeq_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vbottomleft = _mm256_sub_epi8(vbottomleft, sub_amount); + + + // Calculate right side of 64 wide lane + // Calculate the 3 interpolated lines between before and behind. Top row, middle row and bottom row. + __m256i vmiddleright = _mm256_avg_epu8(vbeforeright, vbehindright); + __m256i vtopright = _mm256_avg_epu8(vbeforeright, vmiddleright); + __m256i vbottomright = _mm256_avg_epu8(vmiddleright, vbehindright); + + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the right interpolated value if the two last bits of the difference between before and behind is 0b01. + diff = _mm256_sub_epi8(vbehindright, vbeforeright); + diff = _mm256_and_si256(diff, threes); + mask = _mm256_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vtopright = _mm256_sub_epi8(vtopright, sub_amount); + + // Same rounding error handling for bottom interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. + mask = _mm256_cmpeq_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vbottomright = _mm256_sub_epi8(vbottomright, sub_amount); + + // Store results + _mm256_store_si256((__m256i*)(dst_ptr + 0), vtopleft); + _mm256_store_si256((__m256i*)(dst_ptr + 32), vtopright); + _mm256_store_si256((__m256i*)(dst_ptr + 64), vmiddleleft); + _mm256_store_si256((__m256i*)(dst_ptr + 96), vmiddleright); + _mm256_store_si256((__m256i*)(dst_ptr + 128), vbottomleft); + _mm256_store_si256((__m256i*)(dst_ptr + 160), vbottomright); + // No need to store the last line of the 4 lines as it is already present in the result array and it was not modified in any way. + + vbeforeleft = vbehindleft; + vbeforeright = vbehindright; + + dst_ptr += 256; + src_ptr += 256; + } +} + static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -7139,7 +7221,7 @@ void mip_predict_avx2( } } - //uvg_pixel tmp[64 * 64] = {0}; + uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -7201,7 +7283,8 @@ void mip_predict_avx2( mip_upsampling_w64_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); + //mip_upsampling_w64_ups4_ver_avx2(tmp, ver_src, ref_samples_top); + mip_upsampling_w64_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); } else { //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); From 0ecd13dd3b7682e39b48cca6253fb7ee8e050f0a Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 9 Apr 2024 16:40:58 +0300 Subject: [PATCH 144/237] Implement alternate version of w64 vertical 1 to 8 upsampling. 
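Like the w64 1-to-4 alternate kernel, this version builds the seven in-between rows from nested byte averages instead of widening to 16 bits. Because the average intrinsic always rounds up, repeated averaging can land one above the value the 16-bit kernels compute, (before*8 + 4 + k*(behind - before)) >> 3, so a 0/1 correction derived from the low bits of (behind - before) is subtracted from the affected rows. A scalar illustration of the drift for a single pixel (not taken from the code):

    before = 0, behind = 1, factor 8:
      exact rows k = 1..7:  (0*8 + 4 + k*1) >> 3  ->  0, 0, 0, 1, 1, 1, 1
      nested averages:      avg(0,1) = 1, avg(0, avg(0,1)) = 1, ...
    Every in-between row comes out as 1, so rows 1..3 are one too high and
    the kernel subtracts the correction there.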
--- src/strategies/avx2/intra-avx2.c | 156 +++++++++++++++++++++++++++++-- 1 file changed, 150 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b49554c0..6820f7ae 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6779,10 +6779,9 @@ static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); __m256i vbeforeright = _mm256_load_si256((__m256i*)(ref + 32)); - __m256i zeros = _mm256_setzero_si256(); - __m256i ones = _mm256_set1_epi8(1); - __m256i threes = _mm256_set1_epi8(3); - __m256i permute_mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + const __m256i zeros = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i threes = _mm256_set1_epi8(3); for (int i = 0; i < 8; ++i) { // Calculate 4 lines at a time @@ -7053,6 +7052,150 @@ static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w64_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + const __m256i zeros = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i twos = _mm256_set1_epi8(2); + const __m256i threes = _mm256_set1_epi8(3); + const __m256i fours = _mm256_set1_epi8(4); + const __m256i fives = _mm256_set1_epi8(5); + const __m256i sixes = _mm256_set1_epi8(6); + const __m256i sevens = _mm256_set1_epi8(7); + const __m256i eights = _mm256_set1_epi8(8); + + __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); + __m256i vbeforeright = _mm256_load_si256((__m256i*)(ref + 32)); + + for (int i = 0; i < 8; ++i) { + __m256i vbehindleft = _mm256_load_si256((__m256i*)(src_ptr + 0)); + __m256i vbehindright = _mm256_load_si256((__m256i*)(src_ptr + 32)); + + // Calculate left side of 64 wide lane. + // Calculate the 7 interpolated lines between before and behind. Ordered by number from top to bottom. + __m256i vleft3 = _mm256_avg_epu8(vbeforeleft, vbehindleft); // Middle + __m256i vleft1 = _mm256_avg_epu8(vleft3, vbeforeleft); // Top middle + __m256i vleft5 = _mm256_avg_epu8(vleft3, vbehindleft); // Bottom middle + __m256i vleft0 = _mm256_avg_epu8(vbeforeleft, vleft1); // Top middle top + __m256i vleft2 = _mm256_avg_epu8(vleft1, vleft3); // Top middle bottom + __m256i vleft4 = _mm256_avg_epu8(vleft3, vleft5); // Bottom middle top + __m256i vleft6 = _mm256_avg_epu8(vleft5, vbehindleft); // Bottom middle bottom + + // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + __m256i diff = _mm256_sub_epi8(vbehindleft, vbeforeleft); + diff = _mm256_and_si256(diff, sevens); + __m256i three_diff = _mm256_and_si256(diff, threes); + + // Bottom side + __m256i mask = _mm256_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. 
+ __m256i sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 5, 6, 7 select one + vleft6 = _mm256_sub_epi8(vleft6, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 3 or 7 select one + vleft5 = _mm256_sub_epi8(vleft5, sub_amount); + + __m256i is_two = _mm256_cmpeq_epi8(diff, twos); + __m256i is_five = _mm256_cmpeq_epi8(diff, fives); + mask = _mm256_or_si256(mask, is_two); + mask = _mm256_or_si256(mask, is_five); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 2, 3, 5, or 7 select one + vleft4 = _mm256_sub_epi8(vleft4, sub_amount); + + // Top side + diff = _mm256_blendv_epi8(diff, eights, _mm256_cmpeq_epi8(zeros, diff)); // Replace zeros with eights to enable using GT + mask = _mm256_cmpgt_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(ones, zeros, mask); // If greater than three select zero + vleft0 = _mm256_sub_epi8(vleft0, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, ones); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1 or 5 select one + vleft1 = _mm256_sub_epi8(vleft1, sub_amount); + + __m256i is_three = _mm256_cmpeq_epi8(diff, threes); + __m256i is_six = _mm256_cmpeq_epi8(diff, sixes); + mask = _mm256_or_si256(mask, is_three); + mask = _mm256_or_si256(mask, is_six); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1, 3, 5, 6 select one + vleft2 = _mm256_sub_epi8(vleft2, sub_amount); + + + // Calculate right side of 64 wide lane. + // Calculate the 7 interpolated lines between before and behind. Ordered by number from top to bottom. + __m256i vright3 = _mm256_avg_epu8(vbeforeright, vbehindright); // Middle + __m256i vright1 = _mm256_avg_epu8(vright3, vbeforeright); // Top middle + __m256i vright5 = _mm256_avg_epu8(vright3, vbehindright); // Bottom middle + __m256i vright0 = _mm256_avg_epu8(vbeforeright, vright1); // Top middle top + __m256i vright2 = _mm256_avg_epu8(vright1, vright3); // Top middle bottom + __m256i vright4 = _mm256_avg_epu8(vright3, vright5); // Bottom middle top + __m256i vright6 = _mm256_avg_epu8(vright5, vbehindright); // Bottom middle bottom + + // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + diff = _mm256_sub_epi8(vbehindright, vbeforeright); + diff = _mm256_and_si256(diff, sevens); + three_diff = _mm256_and_si256(diff, threes); + + // Bottom side + mask = _mm256_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. 
+ sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 5, 6, 7 select one + vright6 = _mm256_sub_epi8(vright6, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 3 or 7 select one + vright5 = _mm256_sub_epi8(vright5, sub_amount); + + is_two = _mm256_cmpeq_epi8(diff, twos); + is_five = _mm256_cmpeq_epi8(diff, fives); + mask = _mm256_or_si256(mask, is_two); + mask = _mm256_or_si256(mask, is_five); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 2, 3, 5, or 7 select one + vright4 = _mm256_sub_epi8(vright4, sub_amount); + + // Top side + diff = _mm256_blendv_epi8(diff, eights, _mm256_cmpeq_epi8(zeros, diff)); // Replace zeros with eights to enable using GT + mask = _mm256_cmpgt_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(ones, zeros, mask); // If greater than three select zero + vright0 = _mm256_sub_epi8(vright0, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, ones); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1 or 5 select one + vright1 = _mm256_sub_epi8(vright1, sub_amount); + + is_three = _mm256_cmpeq_epi8(diff, threes); + is_six = _mm256_cmpeq_epi8(diff, sixes); + mask = _mm256_or_si256(mask, is_three); + mask = _mm256_or_si256(mask, is_six); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1, 3, 5, 6 select one + vright2 = _mm256_sub_epi8(vright2, sub_amount); + + + // Store results + _mm256_store_si256((__m256i*)(dst_ptr + 0), vleft0); + _mm256_store_si256((__m256i*)(dst_ptr + 32), vright0); + _mm256_store_si256((__m256i*)(dst_ptr + 64), vleft1); + _mm256_store_si256((__m256i*)(dst_ptr + 96), vright1); + _mm256_store_si256((__m256i*)(dst_ptr + 128), vleft2); + _mm256_store_si256((__m256i*)(dst_ptr + 160), vright2); + _mm256_store_si256((__m256i*)(dst_ptr + 192), vleft3); + _mm256_store_si256((__m256i*)(dst_ptr + 224), vright3); + _mm256_store_si256((__m256i*)(dst_ptr + 256), vleft4); + _mm256_store_si256((__m256i*)(dst_ptr + 288), vright4); + _mm256_store_si256((__m256i*)(dst_ptr + 320), vleft5); + _mm256_store_si256((__m256i*)(dst_ptr + 352), vright5); + _mm256_store_si256((__m256i*)(dst_ptr + 384), vleft6); + _mm256_store_si256((__m256i*)(dst_ptr + 416), vright6); + + vbeforeleft = vbehindleft; + vbeforeright = vbehindright; + + dst_ptr += 512; + src_ptr += 512; + } +} + /** \brief Matrix weighted intra prediction. */ void mip_predict_avx2( @@ -7221,7 +7364,7 @@ void mip_predict_avx2( } } - uvg_pixel tmp[64 * 64] = {0}; + //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -7288,7 +7431,8 @@ void mip_predict_avx2( } else { //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - mip_upsampling_w64_ups8_ver_avx2(result, ver_src, ref_samples_top); + //mip_upsampling_w64_ups8_ver_avx2(tmp, ver_src, ref_samples_top); + mip_upsampling_w64_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); } break; From 71efa7805a8d29ca3fbe0e04eb0aaac247909354 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Apr 2024 13:47:25 +0300 Subject: [PATCH 145/237] Implement alternate version of w32 vertical 1 to 8 upsampling. 
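Same correction scheme as the w64 version, applied to one 32-pixel row per
256-bit register. A hand-worked sanity check of the masks (illustrative
numbers, not produced by the code): take before = 0 and behind = 5, so
diff = 5.

    exact rows 0..6: ((8-j)*0 + j*5 + 4) >> 3   ->  1 1 2 3 3 4 4
    uncorrected avg chain                       ->  1 2 3 3 4 4 5
    -1 from the "1 or 5", "1,3,5,6", "2,3,5,7"
    and "5,6,7" masks (rows 1, 2, 4 and 6)      ->  1 1 2 3 3 4 4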
--- src/strategies/avx2/intra-avx2.c | 95 ++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6820f7ae..d15ada22 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6083,7 +6083,7 @@ static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_ _mm_store_si128((__m128i*)(dst + 96), vres2); _mm_store_si128((__m128i*)(dst + 112), vres3); } - +// static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6300,7 +6300,7 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 192), vavg6); _mm_store_si128((__m128i*)(dst + 224), vavg7); } - +// static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6347,7 +6347,7 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore256 = vbehind256; } } - +// static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6422,7 +6422,7 @@ static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore = vbehind; } } - +// static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6500,7 +6500,7 @@ static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore256b = vbehind256b; } } - +// static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6607,6 +6607,88 @@ static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w32_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + const __m256i zeros = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i twos = _mm256_set1_epi8(2); + const __m256i threes = _mm256_set1_epi8(3); + const __m256i fours = _mm256_set1_epi8(4); + const __m256i fives = _mm256_set1_epi8(5); + const __m256i sixes = _mm256_set1_epi8(6); + const __m256i sevens = _mm256_set1_epi8(7); + const __m256i eights = _mm256_set1_epi8(8); + + __m256i vbefore = _mm256_load_si256((__m256i*)(ref + 0)); + + for (int i = 0; i < 8; ++i) { + __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + + // Calculate the 7 interpolated lines between before and behind. Ordered by number from top to bottom. + __m256i vrow3 = _mm256_avg_epu8(vbefore, vbehind); // Middle + __m256i vrow1 = _mm256_avg_epu8(vrow3, vbefore); // Top middle + __m256i vrow5 = _mm256_avg_epu8(vrow3, vbehind); // Bottom middle + __m256i vrow0 = _mm256_avg_epu8(vbefore, vrow1); // Top middle top + __m256i vrow2 = _mm256_avg_epu8(vrow1, vrow3); // Top middle bottom + __m256i vrow4 = _mm256_avg_epu8(vrow3, vrow5); // Bottom middle top + __m256i vrow6 = _mm256_avg_epu8(vrow5, vbehind); // Bottom middle bottom + + // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. 
+ __m256i diff = _mm256_sub_epi8(vbehind, vbefore); + diff = _mm256_and_si256(diff, sevens); + __m256i three_diff = _mm256_and_si256(diff, threes); + + // Bottom side + __m256i mask = _mm256_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. + __m256i sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 5, 6, 7 select one + vrow6 = _mm256_sub_epi8(vrow6, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 3 or 7 select one + vrow5 = _mm256_sub_epi8(vrow5, sub_amount); + + __m256i is_two = _mm256_cmpeq_epi8(diff, twos); + __m256i is_five = _mm256_cmpeq_epi8(diff, fives); + mask = _mm256_or_si256(mask, is_two); + mask = _mm256_or_si256(mask, is_five); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 2, 3, 5, or 7 select one + vrow4 = _mm256_sub_epi8(vrow4, sub_amount); + + // Top side + diff = _mm256_blendv_epi8(diff, eights, _mm256_cmpeq_epi8(zeros, diff)); // Replace zeros with eights to enable using GT + mask = _mm256_cmpgt_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(ones, zeros, mask); // If greater than three select zero + vrow0 = _mm256_sub_epi8(vrow0, sub_amount); + + mask = _mm256_cmpeq_epi8(three_diff, ones); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1 or 5 select one + vrow1 = _mm256_sub_epi8(vrow1, sub_amount); + + __m256i is_three = _mm256_cmpeq_epi8(diff, threes); + __m256i is_six = _mm256_cmpeq_epi8(diff, sixes); + mask = _mm256_or_si256(mask, is_three); + mask = _mm256_or_si256(mask, is_six); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); // If 1, 3, 5, 6 select one + vrow2 = _mm256_sub_epi8(vrow2, sub_amount); + + // Store results + _mm256_store_si256((__m256i*)(dst_ptr + 0), vrow0); + _mm256_store_si256((__m256i*)(dst_ptr + 32), vrow1); + _mm256_store_si256((__m256i*)(dst_ptr + 64), vrow2); + _mm256_store_si256((__m256i*)(dst_ptr + 96), vrow3); + _mm256_store_si256((__m256i*)(dst_ptr + 128), vrow4); + _mm256_store_si256((__m256i*)(dst_ptr + 160), vrow5); + _mm256_store_si256((__m256i*)(dst_ptr + 192), vrow6); + + vbefore = vbehind; + src_ptr += 256; + dst_ptr += 256; + } +} + static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); @@ -7417,7 +7499,8 @@ void mip_predict_avx2( mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w32_ups8_ver_avx2(result, ver_src, ref_samples_top); + //mip_upsampling_w32_ups8_ver_avx2(tmp, ver_src, ref_samples_top); + mip_upsampling_w32_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); } break; From ea6feb13fc1d02afd688018be6a23fc461fb6bcf Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Apr 2024 13:58:25 +0300 Subject: [PATCH 146/237] Implement alternate version of w32 vertical 1 to 4 upsampling. 
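For the 1:4 factor only the low two bits of the difference matter. The middle
row produced by the average is already exact; the top and bottom rows
overshoot by one exactly when (behind - before) & 3 is 1 or 3 respectively,
which is what the two masks in the new function encode. Scalar sketch of one
sample (illustrative only; avg(a, b) here means (a + b + 1) >> 1):

    int diff   = (behind - before) & 3;
    int middle = avg(before, middle_unused = 0, behind);  /* see below */

    /* i.e. */
    int mid    = avg(before, behind);            /* exact, no correction */
    int top    = avg(before, mid) - (diff == 1); /* fix the 0b01 overshoot */
    int bottom = avg(mid, behind) - (diff == 3); /* fix the 0b11 overshoot */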
--- src/strategies/avx2/intra-avx2.c | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d15ada22..353906bf 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6422,7 +6422,7 @@ static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore = vbehind; } } -// + static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6500,7 +6500,53 @@ static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore256b = vbehind256b; } } -// + +static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + __m256i vbefore = _mm256_load_si256((__m256i*)ref); + + const __m256i zeros = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i threes = _mm256_set1_epi8(3); + + for (int i = 0; i < 8; ++i) { + __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + + // Calculate the 3 interpolated lines between before and behind. Top row, middle row and bottom row. + __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); + __m256i vtop = _mm256_avg_epu8(vbefore, vmiddle); + __m256i vbottom = _mm256_avg_epu8(vmiddle, vbehind); + + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the left interpolated value if the two last bits of the difference between before and behind is 0b01. + __m256i diff = _mm256_sub_epi8(vbehind, vbefore); + diff = _mm256_and_si256(diff, threes); + __m256i mask = _mm256_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. + __m256i sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vtop = _mm256_sub_epi8(vtop, sub_amount); + + // Same rounding error handling for bottom interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. + mask = _mm256_cmpeq_epi8(diff, threes); + sub_amount = _mm256_blendv_epi8(zeros, ones, mask); + + vbottom = _mm256_sub_epi8(vbottom, sub_amount); + + // Store results + _mm256_store_si256((__m256i*)(dst_ptr + 0), vtop); + _mm256_store_si256((__m256i*)(dst_ptr + 32), vmiddle); + _mm256_store_si256((__m256i*)(dst_ptr + 64), vbottom); + + vbefore = vbehind; + src_ptr += 128; + dst_ptr += 128; + } +} + static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -7496,7 +7542,7 @@ void mip_predict_avx2( mip_upsampling_w32_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w32_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); } else { //mip_upsampling_w32_ups8_ver_avx2(tmp, ver_src, ref_samples_top); From 7d54cb68f7799ed14cf019d86fdaee164963c57d Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Apr 2024 16:18:04 +0300 Subject: [PATCH 147/237] Implement alternate w16 vertical 1 to 8 upsampling. 
--- src/strategies/avx2/intra-avx2.c | 88 +++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 353906bf..c24011de 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6347,7 +6347,7 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vbefore256 = vbehind256; } } -// + static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6409,6 +6409,90 @@ static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w16_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + const __m128i zeros = _mm_setzero_si128(); + const __m128i ones = _mm_set1_epi8(1); + const __m128i twos = _mm_set1_epi8(2); + const __m128i threes = _mm_set1_epi8(3); + const __m128i fours = _mm_set1_epi8(4); + const __m128i fives = _mm_set1_epi8(5); + const __m128i sixes = _mm_set1_epi8(6); + const __m128i sevens = _mm_set1_epi8(7); + const __m128i eights = _mm_set1_epi8(8); + + __m128i vbefore = _mm_load_si128((__m128i*)ref); + + for (int i = 0; i < 8; ++i) { + __m128i vbehind = _mm_loadu_si128((__m128i*)src_ptr); + + // Calculate the 7 interpolated lines between before and behind. Ordered by number from top to bottom. + __m128i vrow3 = _mm_avg_epu8(vbefore, vbehind); // Middle + __m128i vrow1 = _mm_avg_epu8(vrow3, vbefore); // Top middle + __m128i vrow5 = _mm_avg_epu8(vrow3, vbehind); // Bottom middle + __m128i vrow0 = _mm_avg_epu8(vbefore, vrow1); // Top middle top + __m128i vrow2 = _mm_avg_epu8(vrow1, vrow3); // Top middle bottom + __m128i vrow4 = _mm_avg_epu8(vrow3, vrow5); // Bottom middle top + __m128i vrow6 = _mm_avg_epu8(vrow5, vbehind); // Bottom middle bottom + + // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + __m128i diff = _mm_sub_epi8(vbehind, vbefore); + diff = _mm_and_si128(diff, sevens); + __m128i three_diff = _mm_and_si128(diff, threes); + + // Bottom side + __m128i mask = _mm_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. 
+ __m128i sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 5, 6, 7 select one + vrow6 = _mm_sub_epi8(vrow6, sub_amount); + + mask = _mm_cmpeq_epi8(three_diff, threes); + sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 3 or 7 select one + vrow5 = _mm_sub_epi8(vrow5, sub_amount); + + __m128i is_two = _mm_cmpeq_epi8(diff, twos); + __m128i is_five = _mm_cmpeq_epi8(diff, fives); + mask = _mm_or_si128(mask, is_two); + mask = _mm_or_si128(mask, is_five); + sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 2, 3, 5, or 7 select one + vrow4 = _mm_sub_epi8(vrow4, sub_amount); + + // Top side + diff = _mm_blendv_epi8(diff, eights, _mm_cmpeq_epi8(zeros, diff)); // Replace zeros with eights to enable using GT + mask = _mm_cmpgt_epi8(diff, threes); + sub_amount = _mm_blendv_epi8(ones, zeros, mask); // If greater than three select zero + vrow0 = _mm_sub_epi8(vrow0, sub_amount); + + mask = _mm_cmpeq_epi8(three_diff, ones); + sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 1 or 5 select one + vrow1 = _mm_sub_epi8(vrow1, sub_amount); + + __m128i is_three = _mm_cmpeq_epi8(diff, threes); + __m128i is_six = _mm_cmpeq_epi8(diff, sixes); + mask = _mm_or_si128(mask, is_three); + mask = _mm_or_si128(mask, is_six); + sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 1, 3, 5, 6 select one + vrow2 = _mm_sub_epi8(vrow2, sub_amount); + + // Store results + _mm_store_si128((__m128i*)(dst_ptr + 0), vrow0); + _mm_store_si128((__m128i*)(dst_ptr + 16), vrow1); + _mm_store_si128((__m128i*)(dst_ptr + 32), vrow2); + _mm_store_si128((__m128i*)(dst_ptr + 48), vrow3); + _mm_store_si128((__m128i*)(dst_ptr + 64), vrow4); + _mm_store_si128((__m128i*)(dst_ptr + 80), vrow5); + _mm_store_si128((__m128i*)(dst_ptr + 96), vrow6); + + vbefore = vbehind; + src_ptr += 128; + dst_ptr += 128; + } +} + + + static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbefore = _mm256_load_si256((__m256i*)ref); @@ -7533,7 +7617,7 @@ void mip_predict_avx2( mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); } break; From 85ef3c537eddbf0ed17448b7535f6b6f406804bf Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Apr 2024 16:47:23 +0300 Subject: [PATCH 148/237] Implement alternate w16 vertical 1 to 4 upsampling. 
--- src/strategies/avx2/intra-avx2.c | 59 +++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c24011de..37f7c6a5 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -6300,7 +6300,7 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 192), vavg6); _mm_store_si128((__m128i*)(dst + 224), vavg7); } -// + static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6348,6 +6348,52 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +static void mip_upsampling_w16_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +{ + const uvg_pixel* src_ptr = src; + const uvg_pixel* dst_ptr = dst; + + __m128i vbefore = _mm_load_si128((__m128i*)ref); + + const __m128i zeros = _mm_setzero_si128(); + const __m128i ones = _mm_set1_epi8(1); + const __m128i threes = _mm_set1_epi8(3); + + for (int i = 0; i < 8; ++i) { + __m128i vbehind = _mm_load_si128((__m128i*)src_ptr); + + // Calculate the 3 interpolated lines between before and behind. Top row, middle row and bottom row. + __m128i vmiddle = _mm_avg_epu8(vbefore, vbehind); + __m128i vtop = _mm_avg_epu8(vbefore, vmiddle); + __m128i vbottom = _mm_avg_epu8(vmiddle, vbehind); + + // Calculate the two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. + // Rounding error occurs in the left interpolated value if the two last bits of the difference between before and behind is 0b01. + __m128i diff = _mm_sub_epi8(vbehind, vbefore); + diff = _mm_and_si128(diff, threes); + __m128i mask = _mm_cmpeq_epi8(diff, ones); // The rounding error mask will be generated based on the calculated last bits. + __m128i sub_amount = _mm_blendv_epi8(zeros, ones, mask); + + vtop = _mm_sub_epi8(vtop, sub_amount); + + // Same rounding error handling for bottom interpolated values. + // Error happens if the two last bits of the difference between before and behind is 0b11. + mask = _mm_cmpeq_epi8(diff, threes); + sub_amount = _mm_blendv_epi8(zeros, ones, mask); + + vbottom = _mm_sub_epi8(vbottom, sub_amount); + + // Store results + _mm_store_si128((__m128i*)(dst_ptr + 0), vtop); + _mm_store_si128((__m128i*)(dst_ptr + 16), vmiddle); + _mm_store_si128((__m128i*)(dst_ptr + 32), vbottom); + + vbefore = vbehind; + src_ptr += 64; + dst_ptr += 64; + } +} + static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6409,6 +6455,7 @@ static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } +// Note: this alternate version is slower than the original version. It is kept here for reference. 
static void mip_upsampling_w16_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -6491,8 +6538,6 @@ static void mip_upsampling_w16_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg } } - - static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbefore = _mm256_load_si256((__m256i*)ref); @@ -7614,10 +7659,10 @@ void mip_predict_avx2( mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); // TODO: change this back to original function. } else { - mip_upsampling_w16_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); // TODO: change this back to the original function. } break; @@ -7629,7 +7674,6 @@ void mip_predict_avx2( mip_upsampling_w32_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); } else { - //mip_upsampling_w32_ups8_ver_avx2(tmp, ver_src, ref_samples_top); mip_upsampling_w32_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); } break; @@ -7639,12 +7683,9 @@ void mip_predict_avx2( mip_upsampling_w64_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - //mip_upsampling_w64_ups4_ver_avx2(tmp, ver_src, ref_samples_top); mip_upsampling_w64_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); } else { - //uvg_mip_pred_upsampling_1D_ver_avx2(tmp, ver_src, ref_samples_top, red_pred_size, width, ver_src_step, 1, width, 1, 1, ups_ver_factor); - //mip_upsampling_w64_ups8_ver_avx2(tmp, ver_src, ref_samples_top); mip_upsampling_w64_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); } break; From dee108af9bd14a1d998f75febe12cba835634082 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 10 Apr 2024 16:56:15 +0300 Subject: [PATCH 149/237] Change slower alt function calls back to original calls. --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 37f7c6a5..139c61d1 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -7659,10 +7659,10 @@ void mip_predict_avx2( mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w16_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); // TODO: change this back to original function. + mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w16_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); // TODO: change this back to the original function. + mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; From a0f5af1510262e8e4eec0fc86e0ab9ee9bdebaef Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Apr 2024 02:35:43 +0300 Subject: [PATCH 150/237] Implement 8 to 4 and 16 to 4 downsampling. 
--- src/strategies/avx2/intra-avx2.c | 126 ++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 139c61d1..b50d26b4 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4577,6 +4577,76 @@ void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pix } } +static void mip_ref_downsampling_1D_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +{ + const uint8_t down_smp_factor = 2; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m128i vrnd = _mm_set1_epi16(rounding_offset); + + __m128i vref = _mm_loadu_si128((__m128i*)ref_src); // Half the data is garbage and will be ignored. + vref = _mm_cvtepu8_epi16(vref); + __m128i vres = _mm_hadd_epi16(vref, vref); + vres = _mm_add_epi16(vres, vrnd); + vres = _mm_srli_epi16(vres, log2_factor); + __m128i vout = _mm_packus_epi16(vres, vres); + + *(int32_t*)reduced_dst = _mm_extract_epi32(vout, 0); +} + +static void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +{ + const uint8_t down_smp_factor = 4; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + // TODO: try _mm256_dpbuud + __m128i vref = _mm_loadu_si128((__m128i*)ref_src); + __m256i vref256 = _mm256_cvtepu8_epi16(vref); + __m256i vres = _mm256_hadd_epi16(vref256, vref256); + vres = _mm256_hadd_epi16(vres, vres); + vres = _mm256_add_epi16(vres, vrnd); + vres = _mm256_srli_epi16(vres, log2_factor); + __m256i vout = _mm256_packus_epi16(vres, vres); + + *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi16(vout, 0); + *(int32_t*)(reduced_dst + 2) = _mm256_extract_epi16(vout, 8); +} + +static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +{ + const uint8_t down_smp_factor = 8; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m128i vrefa = _mm_loadu_si128((__m128i*)(ref_src + 0)); + __m128i vrefb = _mm_loadu_si128((__m128i*)(ref_src + 16)); + + __m256i vref256a = _mm256_cvtepu8_epi16(vrefa); + __m256i vref256b = _mm256_cvtepu8_epi16(vrefb); + + // These instructions cause error 0xC000001D: Illegal Instruction. 
+ /*__m128i vtmpa = _mm_dpbuud_epi32(zeros, vrefa, ones); + __m128i vtmpb = _mm_dpbuud_epi32(zeros, vrefa, ones);*/ + + __m256i vres = _mm256_hadd_epi16(vref256a, vref256b); + vres = _mm256_hadd_epi16(vres, vres); + vres = _mm256_hadd_epi16(vres, vres); + vres = _mm256_hadd_epi16(vres, vres); + + vres = _mm256_add_epi32(vres, vrnd); + vres = _mm256_srli_epi32(vres, log2_factor); + __m256i vout = _mm256_packus_epi16(vres, vres); + + *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi16(vout, 0); + //*(int32_t*)(reduced_dst + 2) = _mm_extract_epi16(vout, 8); +} + void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, const int16_t* const input, @@ -7510,9 +7580,59 @@ void mip_predict_avx2( uvg_pixel* const top_reduced = &red_bdry[0]; uvg_pixel* const left_reduced = &red_bdry[red_bdry_size]; - // These work fine with uvg_pixel - uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); - uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + // Horizontal downsampling + // uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); + uvg_pixel tmp[8]; + switch (width) { + case 4: + if (height == 4) { + uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); + } + else { + // No horizontal downsampling needed + // TODO: copy reference pixels + uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); + } + break; + case 8: + // TODO: for 8x8, make a specialized 2D function. + //uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); + mip_ref_downsampling_1D_8to4_avx2(top_reduced, ref_samples_top); + break; + case 16: + //uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); + mip_ref_downsampling_1D_16to4_avx2(top_reduced, ref_samples_top); + break; + case 32: + uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); + mip_ref_downsampling_1D_32to4_avx2(top_reduced, ref_samples_top); + break; + case 64: uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); break; + default: + assert(false && "MIP horizontal downsampling. Invalid width.\n"); + break; + } + + // Vertical downsampling + // uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + switch (height) { + case 4: + if (width == 4) { + uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + } + else { + // No vertical downsampling needed + // TODO: copy reference pixels + uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + } + case 8: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; + case 16: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; + case 32: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; + case 64: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; + default: + assert(false && "MIP vertical downsampling. 
Invalid height.\n"); + break; + } // Transposed reduced boundaries uvg_pixel* const left_reduced_trans = &red_bdry_trans[0]; From 96037f9daf2074c9e8a3d34a989ebff2e0272005 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Apr 2024 14:52:19 +0300 Subject: [PATCH 151/237] Implement 32 to 4 and 64 to 4 downsampling. --- src/strategies/avx2/intra-avx2.c | 73 +++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b50d26b4..c58ae9d4 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4607,13 +4607,13 @@ static void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, const uvg __m128i vref = _mm_loadu_si128((__m128i*)ref_src); __m256i vref256 = _mm256_cvtepu8_epi16(vref); __m256i vres = _mm256_hadd_epi16(vref256, vref256); + vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); vres = _mm256_hadd_epi16(vres, vres); vres = _mm256_add_epi16(vres, vrnd); vres = _mm256_srli_epi16(vres, log2_factor); __m256i vout = _mm256_packus_epi16(vres, vres); - *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi16(vout, 0); - *(int32_t*)(reduced_dst + 2) = _mm256_extract_epi16(vout, 8); + *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi32(vout, 0); } static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) @@ -4635,15 +4635,56 @@ static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg __m128i vtmpb = _mm_dpbuud_epi32(zeros, vrefa, ones);*/ __m256i vres = _mm256_hadd_epi16(vref256a, vref256b); + vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); vres = _mm256_hadd_epi16(vres, vres); - vres = _mm256_hadd_epi16(vres, vres); + vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); vres = _mm256_hadd_epi16(vres, vres); - vres = _mm256_add_epi32(vres, vrnd); - vres = _mm256_srli_epi32(vres, log2_factor); + vres = _mm256_add_epi16(vres, vrnd); + vres = _mm256_srli_epi16(vres, log2_factor); __m256i vout = _mm256_packus_epi16(vres, vres); - *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi16(vout, 0); + *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi32(vout, 0); + //*(int32_t*)(reduced_dst + 2) = _mm_extract_epi16(vout, 8); +} + +static void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +{ + const uint8_t down_smp_factor = 16; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + __m128i vrefa = _mm_loadu_si128((__m128i*)(ref_src + 0)); + __m128i vrefb = _mm_loadu_si128((__m128i*)(ref_src + 16)); + __m128i vrefc = _mm_loadu_si128((__m128i*)(ref_src + 32)); + __m128i vrefd = _mm_loadu_si128((__m128i*)(ref_src + 48)); + + __m256i vref256a = _mm256_cvtepu8_epi16(vrefa); + __m256i vref256b = _mm256_cvtepu8_epi16(vrefb); + __m256i vref256c = _mm256_cvtepu8_epi16(vrefc); + __m256i vref256d = _mm256_cvtepu8_epi16(vrefd); + + + __m256i vres0 = _mm256_hadd_epi16(vref256a, vref256b); + __m256i vres1 = _mm256_hadd_epi16(vref256c, vref256d); + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); + + vres0 = _mm256_hadd_epi16(vres0, vres1); + vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); + + vres0 = _mm256_hadd_epi16(vres0, vres0); + vres0 = _mm256_permute4x64_epi64(vres0, 
_MM_SHUFFLE(3, 1, 2, 0)); + + vres0 = _mm256_hadd_epi16(vres0, vres0); + + vres0 = _mm256_add_epi16(vres0, vrnd); + vres0 = _mm256_srli_epi16(vres0, log2_factor); + __m256i vout = _mm256_packus_epi16(vres0, vres0); + + *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi32(vout, 0); //*(int32_t*)(reduced_dst + 2) = _mm_extract_epi16(vout, 8); } @@ -7582,7 +7623,6 @@ void mip_predict_avx2( // Horizontal downsampling // uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); - uvg_pixel tmp[8]; switch (width) { case 4: if (height == 4) { @@ -7594,20 +7634,11 @@ void mip_predict_avx2( uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); } break; - case 8: - // TODO: for 8x8, make a specialized 2D function. - //uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); - mip_ref_downsampling_1D_8to4_avx2(top_reduced, ref_samples_top); - break; - case 16: - //uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); - mip_ref_downsampling_1D_16to4_avx2(top_reduced, ref_samples_top); - break; - case 32: - uvg_mip_boundary_downsampling_1D_avx2(tmp, ref_samples_top, width, red_bdry_size); - mip_ref_downsampling_1D_32to4_avx2(top_reduced, ref_samples_top); - break; - case 64: uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); break; + // TODO: for 8x8, make a specialized 2D function. + case 8: mip_ref_downsampling_1D_8to4_avx2(top_reduced, ref_samples_top); break; + case 16: mip_ref_downsampling_1D_16to4_avx2(top_reduced, ref_samples_top); break; + case 32: mip_ref_downsampling_1D_32to4_avx2(top_reduced, ref_samples_top); break; + case 64: mip_ref_downsampling_1D_64to4_avx2(top_reduced, ref_samples_top); break; default: assert(false && "MIP horizontal downsampling. Invalid width.\n"); break; From d9e2ee3f26417e00d17902c460c33a0d82ef6c32 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 12 Apr 2024 16:09:05 +0300 Subject: [PATCH 152/237] Implement downsampling for 4x4 and 8x8 blocks. Do the downsampling for both dimensions inside a single function. 
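For 4x4 and 8x8 the top and left boundaries are short enough to share a single
128-bit load, so one widen/hadd/round pass produces both halves of the reduced
boundary at once instead of two separate 1D calls. Scalar equivalent of the
8x8 case (illustrative only; the 4x4 case is the same with 4-pixel boundaries
reduced to 2 + 2 samples):

    /* Both edges use factor 2: rounded mean of adjacent pixel pairs.
     * reduced[0..3] comes from the top edge, reduced[4..7] from the left. */
    for (int i = 0; i < 4; ++i) {
      reduced[i]     = (ref_top[2 * i]  + ref_top[2 * i + 1]  + 1) >> 1;
      reduced[i + 4] = (ref_left[2 * i] + ref_left[2 * i + 1] + 1) >> 1;
    }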
--- src/strategies/avx2/intra-avx2.c | 132 +++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 41 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c58ae9d4..16b449ea 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4577,6 +4577,55 @@ void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pix } } +static void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) +{ + const uint8_t down_smp_factor = 2; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m128i vrnd = _mm_set1_epi16(rounding_offset); + + ALIGNED(16) uint32_t ref[2]; + ref[0] = *(uint32_t*)ref_top; + ref[1] = *(uint32_t*)ref_left; + + __m128i vref = _mm_load_si128((__m128i*)ref); + vref = _mm_cvtepu8_epi16(vref); + + __m128i vres = _mm_hadd_epi16(vref, vref); + + vres = _mm_add_epi16(vres, vrnd); + vres = _mm_srli_epi16(vres, log2_factor); + __m128i vout = _mm_packus_epi16(vres, vres); + + *(uint32_t*)reduced_dst = _mm_extract_epi32(vout, 0); +} + +static void mip_ref_downsampling_8x8_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) +{ + const uint8_t down_smp_factor = 2; // width / red_bdry_size + const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; + const int rounding_offset = (1 << (log2_factor - 1)); + + const __m256i vrnd = _mm256_set1_epi16(rounding_offset); + + ALIGNED(16) uint64_t ref[2]; + ref[0] = *(uint64_t*)ref_top; + ref[1] = *(uint64_t*)ref_left; + + __m128i vref = _mm_load_si128((__m128i*)ref); + __m256i vref256 = _mm256_cvtepu8_epi16(vref); + + __m256i vres = _mm256_hadd_epi16(vref256, vref256); + vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); + + vres = _mm256_add_epi16(vres, vrnd); + vres = _mm256_srli_epi16(vres, log2_factor); + __m256i vout = _mm256_packus_epi16(vres, vres); + + *(uint64_t*)reduced_dst = _mm256_extract_epi64(vout, 0); +} + static void mip_ref_downsampling_1D_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) { const uint8_t down_smp_factor = 2; // width / red_bdry_size @@ -7621,49 +7670,50 @@ void mip_predict_avx2( uvg_pixel* const top_reduced = &red_bdry[0]; uvg_pixel* const left_reduced = &red_bdry[red_bdry_size]; - // Horizontal downsampling - // uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); - switch (width) { - case 4: - if (height == 4) { - uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); - } - else { - // No horizontal downsampling needed - // TODO: copy reference pixels - uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); - } - break; - // TODO: for 8x8, make a specialized 2D function. - case 8: mip_ref_downsampling_1D_8to4_avx2(top_reduced, ref_samples_top); break; - case 16: mip_ref_downsampling_1D_16to4_avx2(top_reduced, ref_samples_top); break; - case 32: mip_ref_downsampling_1D_32to4_avx2(top_reduced, ref_samples_top); break; - case 64: mip_ref_downsampling_1D_64to4_avx2(top_reduced, ref_samples_top); break; - default: - assert(false && "MIP horizontal downsampling. 
Invalid width.\n"); - break; + if (width == 4 && height == 4) { + // 4 to 2 downsampling for both dimensions + mip_ref_downsampling_4x4_4to2_avx2(top_reduced, ref_samples_top, ref_samples_left); } - - // Vertical downsampling - // uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); - switch (height) { - case 4: - if (width == 4) { - uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); - } - else { - // No vertical downsampling needed - // TODO: copy reference pixels - uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); - } - case 8: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; - case 16: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; - case 32: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; - case 64: uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); break; - default: - assert(false && "MIP vertical downsampling. Invalid height.\n"); - break; + else if (width == 8 && height == 8) { + // 8 to 4 downsampling for both dimensions + mip_ref_downsampling_8x8_8to4_avx2(top_reduced, ref_samples_top, ref_samples_left); } + else { + // Horizontal downsampling + // uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); + switch (width) { + case 4: + // 4x4 case handled elsewhere. + // No horizontal downsampling needed. Copy pixels. + memcpy(top_reduced, ref_samples_top, 4 * sizeof(uvg_pixel)); + break; + case 8: mip_ref_downsampling_1D_8to4_avx2(top_reduced, ref_samples_top); break; // 8x8 case handled elsewhere. + case 16: mip_ref_downsampling_1D_16to4_avx2(top_reduced, ref_samples_top); break; + case 32: mip_ref_downsampling_1D_32to4_avx2(top_reduced, ref_samples_top); break; + case 64: mip_ref_downsampling_1D_64to4_avx2(top_reduced, ref_samples_top); break; + default: + assert(false && "MIP horizontal downsampling. Invalid width.\n"); + break; + } + + // Vertical downsampling + // uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); + switch (height) { + case 4: + // 4x4 case handled elsewhere. + // No vertical downsampling needed. Copy pixels. + memcpy(left_reduced, ref_samples_left, 4 * sizeof(uvg_pixel)); + break; + case 8: mip_ref_downsampling_1D_8to4_avx2(left_reduced, ref_samples_left); break; // 8x8 case handled elsewhere. + case 16: mip_ref_downsampling_1D_16to4_avx2(left_reduced, ref_samples_left); break; + case 32: mip_ref_downsampling_1D_32to4_avx2(left_reduced, ref_samples_left); break; + case 64: mip_ref_downsampling_1D_64to4_avx2(left_reduced, ref_samples_left); break; + default: + assert(false && "MIP vertical downsampling. Invalid height.\n"); + break; + } + } + // Transposed reduced boundaries uvg_pixel* const left_reduced_trans = &red_bdry_trans[0]; From 2150cf046a6abd5dca1e5e9162f2a392aa9a1ed2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 15 Apr 2024 12:05:25 +0300 Subject: [PATCH 153/237] Remove separate output buffer and use passed dst buffer directly. This removes the need to assign results in the end. 
--- src/strategies/avx2/intra-avx2.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 16b449ea..fa5bc45d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -7626,8 +7626,9 @@ void mip_predict_avx2( { // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative - uvg_pixel* out = dst; - uvg_pixel result[64 * 64] = { 0 }; + //uvg_pixel* out = dst; + //uvg_pixel result[64 * 64] = { 0 }; + uvg_pixel* result = dst; const int mode_idx = mip_mode; // *** INPUT PREP *** @@ -7897,11 +7898,6 @@ void mip_predict_avx2( } } } - - // Assign and cast values from temp array to output - for (int i = 0; i < width * height; i++) { - out[i] = (uvg_pixel)result[i]; - } // *** BLOCK PREDICT *** END } From 422c9d7a6327570cda206c98d88ee055276ae8a5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 15 Apr 2024 14:47:29 +0300 Subject: [PATCH 154/237] Clean code. Add some comments. INLINE all downsampling functions. --- src/strategies/avx2/intra-avx2.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fa5bc45d..5ef088b8 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4577,7 +4577,7 @@ void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pix } } -static void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) +static INLINE void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) { const uint8_t down_smp_factor = 2; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4601,7 +4601,7 @@ static void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, const uvg *(uint32_t*)reduced_dst = _mm_extract_epi32(vout, 0); } -static void mip_ref_downsampling_8x8_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) +static INLINE void mip_ref_downsampling_8x8_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) { const uint8_t down_smp_factor = 2; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4626,7 +4626,7 @@ static void mip_ref_downsampling_8x8_8to4_avx2(uvg_pixel* reduced_dst, const uvg *(uint64_t*)reduced_dst = _mm256_extract_epi64(vout, 0); } -static void mip_ref_downsampling_1D_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +static INLINE void mip_ref_downsampling_1D_8to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) { const uint8_t down_smp_factor = 2; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4644,7 +4644,7 @@ static void mip_ref_downsampling_1D_8to4_avx2(uvg_pixel* reduced_dst, const uvg_ *(int32_t*)reduced_dst = _mm_extract_epi32(vout, 0); } -static void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +static INLINE void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) { const uint8_t down_smp_factor = 4; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4652,7 +4652,9 @@ static void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, 
const uvg const __m256i vrnd = _mm256_set1_epi16(rounding_offset); - // TODO: try _mm256_dpbuud + // TODO: try _mm256_dpbuud. + // NOTE: ignore this TODO for now, using dpbuud causes error 0xC000001D: Illegal Instruction. + // The instruction requires a newer CPU. __m128i vref = _mm_loadu_si128((__m128i*)ref_src); __m256i vref256 = _mm256_cvtepu8_epi16(vref); __m256i vres = _mm256_hadd_epi16(vref256, vref256); @@ -4665,7 +4667,7 @@ static void mip_ref_downsampling_1D_16to4_avx2(uvg_pixel* reduced_dst, const uvg *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi32(vout, 0); } -static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +static INLINE void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) { const uint8_t down_smp_factor = 8; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4678,10 +4680,6 @@ static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg __m256i vref256a = _mm256_cvtepu8_epi16(vrefa); __m256i vref256b = _mm256_cvtepu8_epi16(vrefb); - - // These instructions cause error 0xC000001D: Illegal Instruction. - /*__m128i vtmpa = _mm_dpbuud_epi32(zeros, vrefa, ones); - __m128i vtmpb = _mm_dpbuud_epi32(zeros, vrefa, ones);*/ __m256i vres = _mm256_hadd_epi16(vref256a, vref256b); vres = _mm256_permute4x64_epi64(vres, _MM_SHUFFLE(3, 1, 2, 0)); @@ -4694,10 +4692,9 @@ static void mip_ref_downsampling_1D_32to4_avx2(uvg_pixel* reduced_dst, const uvg __m256i vout = _mm256_packus_epi16(vres, vres); *(int32_t*)(reduced_dst + 0) = _mm256_extract_epi32(vout, 0); - //*(int32_t*)(reduced_dst + 2) = _mm_extract_epi16(vout, 8); } -static void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) +static INLINE void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src) { const uint8_t down_smp_factor = 16; // width / red_bdry_size const int log2_factor = uvg_g_convert_to_log2[down_smp_factor]; @@ -4738,6 +4735,7 @@ static void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, const uvg } +// This function is not optimized, do not use in production. It is left here for reference. void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, const int16_t* const input, const uint8_t* matrix, @@ -7839,7 +7837,6 @@ void mip_predict_avx2( break; case 8: - // TODO: remove the if clauses and add a switch for height if (ups_ver_factor == 2) { if (height == 8) { mip_upsampling_w8_ups2_h8_ver_avx2(result, ver_src, ref_samples_top); From d95d5846d0c14c8b706bb4b4ca83a49c1edaf2a7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 15 Apr 2024 16:51:37 +0300 Subject: [PATCH 155/237] Implement AVX2 transpose after mip predict for size id 0. Remove uvg prefix from sid0 function. TODO: remove this from all other static functions too. --- src/strategies/avx2/intra-avx2.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5ef088b8..cc2b0945 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4735,7 +4735,7 @@ static INLINE void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, co } -// This function is not optimized, do not use in production. It is left here for reference. +// This function is not optimized, do not use in production. It is left here for reference. 
void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, const int16_t* const input, const uint8_t* matrix, @@ -4796,7 +4796,7 @@ void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, // Size ID 0 -void uvg_mip_reduced_pred_sid0_avx2(uvg_pixel* const output, +static INLINE void mip_reduced_pred_sid0_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, const bool transpose, @@ -4804,12 +4804,8 @@ void uvg_mip_reduced_pred_sid0_avx2(uvg_pixel* const output, const int in_offset_tr) { const int input_size = 4; - const int pred_size = 4; - const int size_id = 0; - - // Use local buffer for transposed result - uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; + // const int pred_size = 4; + // const int size_id = 0; int sum = 0; for (int i = 0; i < input_size; i++) { @@ -4828,6 +4824,11 @@ void uvg_mip_reduced_pred_sid0_avx2(uvg_pixel* const output, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ); + + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); const __m128i vinraw = _mm_loadu_si128((__m128i*)input); const __m128i vin = _mm_shuffle_epi8(vinraw, vshuf); @@ -4883,14 +4884,12 @@ void uvg_mip_reduced_pred_sid0_avx2(uvg_pixel* const output, __m128i vres16_b = _mm_packus_epi32(vresult0, vresult1); __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); - _mm_storeu_si128((__m128i*)out_ptr, vres8); - if (transpose) { - for (int y = 0; y < pred_size; y++) { - for (int x = 0; x < pred_size; x++) { - output[y * pred_size + x] = out_ptr[x * pred_size + y]; - } - } + vres8 = _mm_shuffle_epi8(vres8, vtranspose); + _mm_storeu_si128((__m128i*)output, vres8); + } + else { + _mm_storeu_si128((__m128i*)output, vres8); } } @@ -7772,7 +7771,7 @@ void mip_predict_avx2( const int16_t* const reduced_bdry16 = transpose ? red_bdry_trans16 : red_bdry16; switch (size_id) { - case 0: uvg_mip_reduced_pred_sid0_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 0: mip_reduced_pred_sid0_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; case 1: // Size id 1 can use the same function as size id 2 case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, red_pred_size, transpose, input_offset, input_offset_trans); break; default: From 3585d9308a53919674f3f0ab6a4d0d7c7116dc2f Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Apr 2024 00:43:03 +0300 Subject: [PATCH 156/237] Implement AVX2 transpose for MIP predictors. Re-implement size id 1 MIP predictor, size id 2 version was doing unnecessary work when used for some size id 1 cases. --- src/strategies/avx2/intra-avx2.c | 248 ++++++++++++++++--------------- 1 file changed, 130 insertions(+), 118 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index cc2b0945..a52de376 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4905,10 +4905,6 @@ void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, const int pred_size = 4; const int size_id = 1; - // Use local buffer for transposed result - uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* out_ptr = transpose ? 
out_buf_transposed : output; - int sum = 0; for (int i = 0; i < input_size; i++) { sum += input[i]; @@ -4934,6 +4930,10 @@ void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, const __m128i vshuf3 = _mm_setr_epi8( 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f, 0x0c, 0x0d, 0x0e, 0x0f); + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); const __m128i vinraw = _mm_loadu_si128((__m128i*)input); @@ -4943,130 +4943,111 @@ void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, const __m128i vin3 = _mm_shuffle_epi8(vinraw, vshuf3); - for (int y = 0; y < pred_size; y += 2) { - // Calculate row 1, first 4 - __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); - __m128i vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); - __m128i vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); - __m128i vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); - - __m128i vmadd0 = _mm_madd_epi16(vin0, vweight0); - __m128i vmadd1 = _mm_madd_epi16(vin1, vweight1); - __m128i vmadd2 = _mm_madd_epi16(vin2, vweight2); - __m128i vmadd3 = _mm_madd_epi16(vin3, vweight3); - - __m128i vadd0 = _mm_add_epi32(vmadd0, vmadd1); - __m128i vadd1 = _mm_add_epi32(vmadd2, vmadd3); - - __m128i result0 = _mm_add_epi32(vadd0, vadd1); - - result0 = _mm_add_epi32(result0, vofs); - result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); - result0 = _mm_add_epi32(result0, vinofs); - - weight += input_size * 4; - - // Calculate row 1, last 4 - vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); - vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); - vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); - vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); - - vmadd0 = _mm_madd_epi16(vin0, vweight0); - vmadd1 = _mm_madd_epi16(vin1, vweight1); - vmadd2 = _mm_madd_epi16(vin2, vweight2); - vmadd3 = _mm_madd_epi16(vin3, vweight3); - - vadd0 = _mm_add_epi32(vmadd0, vmadd1); - vadd1 = _mm_add_epi32(vmadd2, vmadd3); - - __m128i result1 = _mm_add_epi32(vadd0, vadd1); - - result1 = _mm_add_epi32(result1, vofs); - result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); - result1 = _mm_add_epi32(result1, vinofs); - - __m128i vres16_a = _mm_packus_epi32(result0, result1); - - weight += input_size * 4; - - // Calculate row 2, first 4 - vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); - vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); - vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); - vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); - - vmadd0 = _mm_madd_epi16(vin0, vweight0); - vmadd1 = _mm_madd_epi16(vin1, vweight1); - vmadd2 = _mm_madd_epi16(vin2, vweight2); - vmadd3 = _mm_madd_epi16(vin3, vweight3); - - vadd0 = _mm_add_epi32(vmadd0, vmadd1); - vadd1 = _mm_add_epi32(vmadd2, vmadd3); - - result0 = _mm_add_epi32(vadd0, vadd1); - - result0 = _mm_add_epi32(result0, vofs); - result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); - result0 = _mm_add_epi32(result0, vinofs); - - weight += input_size * 4; - - // Calculate row 2, last 4 - vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); - vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); - vweight2 = _mm_loadu_si128((__m128i*) &weight[16]); - vweight3 = _mm_loadu_si128((__m128i*) &weight[24]); - - vmadd0 = _mm_madd_epi16(vin0, vweight0); - vmadd1 = _mm_madd_epi16(vin1, vweight1); - vmadd2 = _mm_madd_epi16(vin2, vweight2); - vmadd3 = _mm_madd_epi16(vin3, vweight3); - - vadd0 = _mm_add_epi32(vmadd0, vmadd1); - vadd1 = _mm_add_epi32(vmadd2, vmadd3); - - result1 = 
_mm_add_epi32(vadd0, vadd1); - - result1 = _mm_add_epi32(result1, vofs); - result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); - result1 = _mm_add_epi32(result1, vinofs); - - __m128i vres16_b = _mm_packus_epi32(result0, result1); - __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); - - _mm_storeu_si128((__m128i*)out_ptr, vres8); + // Calculate row 1, first 4 + __m128i vweight0 = _mm_loadu_si128((__m128i*)&weight[0]); + __m128i vweight1 = _mm_loadu_si128((__m128i*)&weight[8]); + __m128i vweight2 = _mm_loadu_si128((__m128i*)&weight[16]); + __m128i vweight3 = _mm_loadu_si128((__m128i*)&weight[24]); + __m128i vmadd0 = _mm_madd_epi16(vin0, vweight0); + __m128i vmadd1 = _mm_madd_epi16(vin1, vweight1); + __m128i vmadd2 = _mm_madd_epi16(vin2, vweight2); + __m128i vmadd3 = _mm_madd_epi16(vin3, vweight3); + __m128i vadd0 = _mm_add_epi32(vmadd0, vmadd1); + __m128i vadd1 = _mm_add_epi32(vmadd2, vmadd3); + __m128i result0 = _mm_add_epi32(vadd0, vadd1); + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + + // Calculate row 1, last 4 + vweight0 = _mm_loadu_si128((__m128i*)&weight[0]); + vweight1 = _mm_loadu_si128((__m128i*)&weight[8]); + vweight2 = _mm_loadu_si128((__m128i*)&weight[16]); + vweight3 = _mm_loadu_si128((__m128i*)&weight[24]); + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + __m128i result1 = _mm_add_epi32(vadd0, vadd1); + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + + __m128i vres16_a = _mm_packus_epi32(result0, result1); + + + weight += input_size * 4; + // Calculate row 2, first 4 + vweight0 = _mm_loadu_si128((__m128i*)&weight[0]); + vweight1 = _mm_loadu_si128((__m128i*)&weight[8]); + vweight2 = _mm_loadu_si128((__m128i*)&weight[16]); + vweight3 = _mm_loadu_si128((__m128i*)&weight[24]); + + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + + result0 = _mm_add_epi32(vadd0, vadd1); + + result0 = _mm_add_epi32(result0, vofs); + result0 = _mm_srai_epi32(result0, MIP_SHIFT_MATRIX); + result0 = _mm_add_epi32(result0, vinofs); + + weight += input_size * 4; + // Calculate row 2, last 4 + vweight0 = _mm_loadu_si128((__m128i*)&weight[0]); + vweight1 = _mm_loadu_si128((__m128i*)&weight[8]); + vweight2 = _mm_loadu_si128((__m128i*)&weight[16]); + vweight3 = _mm_loadu_si128((__m128i*)&weight[24]); + vmadd0 = _mm_madd_epi16(vin0, vweight0); + vmadd1 = _mm_madd_epi16(vin1, vweight1); + vmadd2 = _mm_madd_epi16(vin2, vweight2); + vmadd3 = _mm_madd_epi16(vin3, vweight3); + vadd0 = _mm_add_epi32(vmadd0, vmadd1); + vadd1 = _mm_add_epi32(vmadd2, vmadd3); + result1 = _mm_add_epi32(vadd0, vadd1); + result1 = _mm_add_epi32(result1, vofs); + result1 = _mm_srai_epi32(result1, MIP_SHIFT_MATRIX); + result1 = _mm_add_epi32(result1, vinofs); + __m128i vres16_b = _mm_packus_epi32(result0, result1); + __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); + if (transpose) { + vres8 = _mm_shuffle_epi8(vres8, vtranspose); + _mm_storeu_si128((__m128i*)output, vres8); - //out_ptr[pos_res] = 
CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); - out_ptr += 16; - weight += input_size * 4; + } else { + _mm_storeu_si128((__m128i*)output, vres8); } - - if (transpose) { + /*if (transpose) { for (int y = 0; y < pred_size; y++) { for (int x = 0; x < pred_size; x++) { output[y * pred_size + x] = out_ptr[x * pred_size + y]; } } - } + }*/ } // Size ID 2 void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, - const int red_pred_size, const bool transpose, const int in_offset, const int in_offset_tr) { const int input_size = 8; - const int pred_size = red_pred_size; + const int pred_size = 8; const int size_id = 2; - // Use local buffer for transposed result - uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; + uvg_pixel * out_ptr = output; int sum = 0; for (int i = 0; i < input_size; i++) { @@ -5100,9 +5081,14 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const __m128i vin1 = _mm_shuffle_epi8(vinraw, vshuf1); const __m128i vin2 = _mm_shuffle_epi8(vinraw, vshuf2); const __m128i vin3 = _mm_shuffle_epi8(vinraw, vshuf3); + __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f + ); + + __m128i vtmpres[4]; - // TODO: this does one unnecessary loop for sizes 8x4 and 4x8. Solve this. - for (int y = 0; y < pred_size; y += 2) { + for (int y = 0, tmp = 0; y < pred_size; y += 2, ++tmp) { // Calculate row 1, first 4 __m128i vweight0 = _mm_loadu_si128((__m128i*) &weight[0]); __m128i vweight1 = _mm_loadu_si128((__m128i*) &weight[8]); @@ -5194,7 +5180,14 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, __m128i vres16_b = _mm_packus_epi32(result0, result1); __m128i vres8 = _mm_packus_epi16(vres16_a, vres16_b); - _mm_storeu_si128((__m128i*)out_ptr, vres8); + if (transpose) { + // Store into temporary storage, transpose later + vtmpres[tmp] = vres8; + } + else { + _mm_storeu_si128((__m128i*)out_ptr, vres8); + out_ptr += 16; + } //out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); out_ptr += 16; @@ -5202,11 +5195,30 @@ void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, } if (transpose) { - for (int y = 0; y < pred_size; y++) { - for (int x = 0; x < pred_size; x++) { - output[y * pred_size + x] = out_ptr[x * pred_size + y]; - } - } + vtmpres[0] = _mm_shuffle_epi8(vtmpres[0], vtranspose); + vtmpres[1] = _mm_shuffle_epi8(vtmpres[1], vtranspose); + vtmpres[2] = _mm_shuffle_epi8(vtmpres[2], vtranspose); + vtmpres[3] = _mm_shuffle_epi8(vtmpres[3], vtranspose); + + __m128i v16lo0 = _mm_unpacklo_epi16(vtmpres[0], vtmpres[1]); + __m128i v16lo1 = _mm_unpacklo_epi16(vtmpres[2], vtmpres[3]); + __m128i v16hi0 = _mm_unpackhi_epi16(vtmpres[0], vtmpres[1]); + __m128i v16hi1 = _mm_unpackhi_epi16(vtmpres[2], vtmpres[3]); + + __m128i v32lo0 = _mm_unpacklo_epi32(v16lo0, v16lo1); + __m128i v32lo1 = _mm_unpacklo_epi32(v16hi0, v16hi1); + __m128i v32hi0 = _mm_unpackhi_epi32(v16lo0, v16lo1); + __m128i v32hi1 = _mm_unpackhi_epi32(v16hi0, v16hi1); + + /*__m128i vout0 = _mm_unpacklo_epi64(v32lo0, v32hi0); + __m128i vout1 = _mm_unpacklo_epi64(v32lo1, v32hi1); + __m128i vout2 = _mm_unpackhi_epi64(v32lo0, v32hi0); + __m128i vout3 = _mm_unpackhi_epi64(v32lo1, v32hi1);*/ + + _mm_store_si128((__m128i*)(output + 0), v32lo0); + _mm_store_si128((__m128i*)(output + 16), v32hi0); + 
_mm_store_si128((__m128i*)(output + 32), v32lo1); + _mm_store_si128((__m128i*)(output + 48), v32hi1); } } @@ -7772,8 +7784,8 @@ void mip_predict_avx2( switch (size_id) { case 0: mip_reduced_pred_sid0_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; - case 1: // Size id 1 can use the same function as size id 2 - case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, red_pred_size, transpose, input_offset, input_offset_trans); break; + case 1: uvg_mip_reduced_pred_sid1_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; default: assert(false && "Intra MIP: invalid size id.\n"); break; From 5bdffd453b257bd9d73ce9347d2caad935e2aad5 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Apr 2024 00:59:15 +0300 Subject: [PATCH 157/237] Inline MIP predictors. Remove uvg prefix. --- src/strategies/avx2/intra-avx2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a52de376..b19a1f58 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4894,7 +4894,7 @@ static INLINE void mip_reduced_pred_sid0_avx2(uvg_pixel* const output, } // Size ID 1 -void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, +void INLINE mip_reduced_pred_sid1_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, const bool transpose, @@ -5036,7 +5036,7 @@ void uvg_mip_reduced_pred_sid1_avx2(uvg_pixel* const output, } // Size ID 2 -void uvg_mip_reduced_pred_sid2_avx2(uvg_pixel* const output, +void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, const bool transpose, @@ -7784,8 +7784,8 @@ void mip_predict_avx2( switch (size_id) { case 0: mip_reduced_pred_sid0_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; - case 1: uvg_mip_reduced_pred_sid1_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; - case 2: uvg_mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 1: mip_reduced_pred_sid1_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; + case 2: mip_reduced_pred_sid2_avx2(reduced_pred, reduced_bdry16, matrix16, transpose, input_offset, input_offset_trans); break; default: assert(false && "Intra MIP: invalid size id.\n"); break; From 8642d66fde6c157568be042a8ecb9f8593165163 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Apr 2024 13:41:16 +0300 Subject: [PATCH 158/237] Clean code. --- src/strategies/avx2/intra-avx2.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b19a1f58..05e93bcb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -90,7 +90,6 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. 
line == 2 }; -// TODO: Reduce size back to 2048 if last line is not needed ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, // offset 0, line == 64 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, @@ -729,17 +728,6 @@ static void uvg_angular_pred_avx2_old( } -// TODO: vectorize -// TODO: obsolete, remove all usage -static void angular_pred_avx2_delta_tables(int64_t* delta_int, int32_t* delta_fract, const int line, const int mrl, const int sample_disp) -{ - for (int i = 0, delta_pos = sample_disp * (1 + mrl); i < line; ++i, delta_pos += sample_disp) { - delta_int[i] = delta_pos >> 5; - delta_fract[i] = delta_pos & (32 - 1); - } -} - - static const int16_t cubic_filter[32][4] = { { 0, 64, 0, 0 }, From 85dc148ce54190dea7fbaf630b2a02baff97cbb0 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 17 Apr 2024 16:09:06 +0300 Subject: [PATCH 159/237] Improve w8 vertical angular prediction. Replace gather with loads. Replace extract with store. --- src/strategies/avx2/intra-avx2.c | 43 +++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 05e93bcb..a31bfed0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -931,31 +931,27 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, { const int width = 8; - const __m256i p_shuf_01 = _mm256_setr_epi8( - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + const __m128i p_shuf_01 = _mm_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 ); - const __m256i p_shuf_23 = _mm256_setr_epi8( + const __m128i p_shuf_23 = _mm_setr_epi8( 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, - 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a ); const __m256i w_shuf_01 = _mm256_setr_epi8( 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a ); const __m256i w_shuf_23 = _mm256_setr_epi8( 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, + 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e ); @@ -964,8 +960,8 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i all_weights; if (use_cubic) { int16_t tmp[8]; - memcpy(&tmp[0], cubic_filter[delta_fract[y + 0]], 8); - memcpy(&tmp[4], cubic_filter[delta_fract[y + 1]], 8); + memcpy(&tmp[0], cubic_filter[delta_fract[y + 0]], 4 * sizeof(int16_t)); + memcpy(&tmp[4], cubic_filter[delta_fract[y + 1]], 4 * sizeof(int16_t)); all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); } else { @@ -983,20 +979,20 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, // Do 4-tap intra interpolation filtering uvg_pixel* p = (uvg_pixel*)ref_main; - 
// This solution assumes the delta int values to be 64-bit - // Cast from 16-bit to 64-bit. - __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], - delta_int[y + 1], // TODO: flip these middle ones, then replace gather with 128-bit load. Replace extract with store. Also, fix shuffle vectors. - delta_int[y + 0] + 4, - delta_int[y + 1] + 4); + + // Weights are 16-bit, but shuffle will cut out the unnecessary bits. __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); for (int_fast32_t x = 0; x < width; x += 8, p += 8) { + __m128i vp0 = _mm_loadu_si128((__m128i*)(p + delta_int[y + 0])); + __m128i vp1 = _mm_loadu_si128((__m128i*)(p + delta_int[y + 1])); - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + __m256i vp_01 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_01)); + vp_01 = _mm256_inserti128_si256(vp_01, _mm_shuffle_epi8(vp1, p_shuf_01), 1); + + __m256i vp_23 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_23)); + vp_23 = _mm256_inserti128_si256(vp_23, _mm_shuffle_epi8(vp1, p_shuf_23), 1); __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); @@ -1008,10 +1004,7 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint32_t*)(dst + (y + 0) * width + (x + 0)) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + (x + 0)) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 0) * width + (x + 4)) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 1) * width + (x + 4)) = _mm_extract_epi32(filtered, 3); + _mm_store_si128((__m128i*)(dst + (y * 8)), filtered); } } } From 6dcd9ab9818b2db7e0f3377eefc8bfd43b126436 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Apr 2024 01:35:26 +0300 Subject: [PATCH 160/237] Implement PDPC for intra mode 18. --- src/strategies/avx2/intra-avx2.c | 509 +++++++++++++++++++++++-------- 1 file changed, 387 insertions(+), 122 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a31bfed0..7c63e2f0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2083,6 +2083,7 @@ static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } + static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2172,12 +2173,6 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } -static void angular_pdpc_ver_w8_high_angle_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int mode_disp) -{ - -} - - static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); @@ -2406,7 +2401,6 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } } - static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. 
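The scalar reference added in the hunk below spells out the mode-18 weighting that the vectorized width variants implement: only the first MIN(3 << scale, height) rows are filtered, the weight is wT = 32 >> ((yy * 2) >> scale), and each sample becomes CLIP_TO_PIXEL(val + (((ref_top - top_left) * wT + 32) >> 6)). A worked instance, assuming scale == 1 purely for illustration: the limit is min(6, height) rows and wT runs 32, 16, 8, 4, 2, 1 down the rows, so row 0 gains roughly half of the (ref_top - top_left) gradient, row 1 roughly a quarter, and the correction has faded out completely after six rows.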
@@ -3070,6 +3064,293 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } +static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) +{ + const int limit = MIN(3 << scale, height); + for (int_fast32_t x = 0; x < width; ++x) { + const uvg_pixel ref_top = ref_side[1 + x]; + for (int yy = 0; yy < limit; ++yy) { + const int wT = 32 >> ((yy * 2) >> scale); + const uvg_pixel val = dst[yy * width + x]; + dst[yy * width + x] = CLIP_TO_PIXEL(val + (((ref_top - top_left) * wT + 32) >> 6)); + } + } +} + +static void angular_pdpc_mode18_w4_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 4; + const int limit = MIN(3 << scale, height); + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width + __m256i v32s = _mm256_set1_epi16(32); + + const uint32_t ref4 = *(uint32_t*)&ref_side[1]; + + __m128i vref = _mm_set1_epi32(ref4); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Weight table offset + const int table_offset = scale * 64; + + for (int y = 0, o = 0; y < limit; y += 4, o += 16) { + const int offset = table_offset + o; + + __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vwT = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_hor_weight[offset]); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_mode18_w8_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 8; + + int limit = MIN(3 << scale, height); + + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width + __m256i v32s = _mm256_set1_epi16(32); + + const uint64_t ref8 = *(uint64_t*)&ref_side[1]; + + __m128i vref = _mm_set1_epi64x(ref8); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Weight table offset + const int table_offset = scale * 128; + + for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { + const __m256i vwT = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_hor_weight[o]); + + __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_mode18_w16_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 16; + int limit 
= MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vref = _mm_loadu_si128((const __m128i*)&ref_side[1]); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. + for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + for (int x = 0; x < width; x += 16) { + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); + } + } +} + +static void angular_pdpc_mode18_w32_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 32; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vrefa = _mm_loadu_si128((const __m128i*) &ref_side[1]); + __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); + + __m128i vrefb = _mm_loadu_si128((const __m128i*) &ref_side[17]); + __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. + for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + // Calculate first half + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16a, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 0)), filtered); + + // Calculate second half + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + accu = _mm256_sub_epi16(vref16b, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + lo = _mm256_castsi256_si128(accu); + hi = _mm256_extracti128_si256(accu, 1); + filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 16)), filtered); + } +} + +static void angular_pdpc_mode18_w64_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 64; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vrefa = _mm_loadu_si128((const __m128i*) &ref_side[0 + 1]); + __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); + + __m128i vrefb = _mm_loadu_si128((const __m128i*) &ref_side[16 + 1]); + __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); + + __m128i vrefc = _mm_loadu_si128((const __m128i*) &ref_side[32 + 1]); + __m256i vref16c = _mm256_cvtepu8_epi16(vrefc); + + __m128i vrefd = _mm_loadu_si128((const 
__m128i*) &ref_side[48 + 1]); + __m256i vref16d = _mm256_cvtepu8_epi16(vrefd); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. + for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + // Calculate first quarter + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16a, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 0)), filtered); + + // Calculate second quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + accu = _mm256_sub_epi16(vref16b, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + lo = _mm256_castsi256_si128(accu); + hi = _mm256_extracti128_si256(accu, 1); + filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 16)), filtered); + + // Calculate third quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 32))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + accu = _mm256_sub_epi16(vref16c, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + lo = _mm256_castsi256_si128(accu); + hi = _mm256_extracti128_si256(accu, 1); + filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 32)), filtered); + + // Calculate fourth quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 48))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + accu = _mm256_sub_epi16(vref16d, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + lo = _mm256_castsi256_si128(accu); + hi = _mm256_extracti128_si256(accu, 1); + filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + 48)), filtered); + } +} + + +static void angular_pdpc_mode50_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) +{ + const int limit = MIN(3 << scale, width); + for (int y = 0; y < height; ++y) { + const uvg_pixel left = ref_side[1 + y]; + for (int x = 0; x < limit; x++) { + const int wL = 32 >> (2 * x >> scale); + const uvg_pixel val = dst[y * width + x]; + dst[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + } + } +} + + static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, @@ -3286,11 +3567,7 @@ static void uvg_angular_pred_avx2( } else { // Mode is horizontal or vertical, just copy the pixels. - // NOTE: includes PDPC. 
- if (vertical_mode) { - const uvg_pixel top_left = ref_main[0]; - int scale = (log2_width + log2_height - 2) >> 2; for (int_fast32_t y = 0; y < height; ++y) { switch (width) { case 4: memcpy(&dst[y * 4], &ref_main[1], 4 * sizeof(uvg_pixel)); break; @@ -3299,25 +3576,9 @@ static void uvg_angular_pred_avx2( case 32: memcpy(&dst[y * 32], &ref_main[1], 32 * sizeof(uvg_pixel)); break; case 64: memcpy(&dst[y * 64], &ref_main[1], 64 * sizeof(uvg_pixel)); break; } - - /*for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; - }*/ - - // PDPC - if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); - } - } } } else { - const uvg_pixel top_left = ref_main[0]; - int scale = (log2_width + log2_height - 2) >> 2; for (int y = 0; y < height; ++y) { switch (width) { case 4: memset(&dst[y * 4], ref_main[y + 1], 4 * sizeof(uvg_pixel)); break; @@ -3330,121 +3591,125 @@ static void uvg_angular_pred_avx2( break; } } - for (int_fast32_t x = 0; x < width; ++x) { - /*for (int y = 0; y < height; ++y) { - dst[y * width + x] = ref_main[y + 1]; - }*/ - - // PDPC - if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - const uvg_pixel ref_top = ref_side[1 + x]; - for (int yy = 0; yy < MIN(3 << scale, height); ++yy) { - const int wT = 32 >> ((yy * 2) >> scale); - - const uvg_pixel val = dst[yy * width + x]; - dst[yy * width + x] = CLIP_TO_PIXEL(val + (((ref_top - top_left) * wT + 32) >> 6)); - - // pred_samples[x][y] = CLIP((refL[x][y] * wL[x] + refT[x][y] * wT[y] + (64 - wL[x] - wT[y]) * pred_samples[x][y] + 32) >> 6 ) - } - } - } } } - // PDPC for non-horizontal and non-vertical modes - if (!(pred_mode == 18 || pred_mode == 50)) { - bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); - if (pred_mode > 1 && pred_mode < 67) { - // Disable PDPC filter if both references are used or if MRL is used - if (mode_disp < 0 || multi_ref_index) { - PDPC_filter = false; - } - else if (mode_disp > 0) { - // If scale is negative, PDPC filtering has no effect, therefore disable it. - PDPC_filter &= (scale >= 0); + + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); + if (pred_mode > 1 && pred_mode < 67) { + // Disable PDPC filter if both references are used or if MRL is used + if (mode_disp < 0 || multi_ref_index) { + PDPC_filter = false; + } + else if (mode_disp > 0) { + // If scale is negative, PDPC filtering has no effect, therefore disable it. 
+ PDPC_filter &= (scale >= 0); + } + } + if (PDPC_filter) { + // Handle pure horizontal and vertical with separate PDPC solution + if (pred_mode == 18) { + scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + + switch (width) { + case 4: angular_pdpc_mode18_w4_avx2(dst, top_left, ref_side, height, scale); break; + case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; + case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; + case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; + case 64: angular_pdpc_mode18_w64_avx2(dst, top_left, ref_side, height, scale); break; + default: + assert(false && "Intra PDPC, invalid width.\n"); + break; } } - if (PDPC_filter) { + else if (pred_mode == 50) { + scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + angular_pdpc_mode50_avx2(dst, top_left, ref_side, width, height, scale); + + } + else { if (vertical_mode) { // Note: no need to check for negative mode_disp, since it is already checked before. switch (width) { - case 4: - // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. - // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + case 4: + // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. + // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + if (mode_disp < 6) + angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + break; + case 8: + if (scale == 0) { + if (mode_disp < 6) + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + } + else if (scale == 1) { + if (mode_disp < 8) + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + } + else { + if (mode_disp < 10) + angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); + } + break; + case 16: // 16 width and higher done with the same functions + case 32: + case 64: + switch (scale) { + case 0: if (mode_disp < 6) - angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); break; - case 8: - if (scale == 0) { - if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); - else - angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); - } - else if (scale == 1) { - if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); - else - angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); - } - else { - if (mode_disp < 10) - angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); - else - 
angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); - } + case 1: + if (mode_disp < 8) + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); break; - case 16: // 16 width and higher done with the same functions - case 32: - case 64: - switch (scale) { - case 0: - if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); - else - angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); - break; - case 1: - if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); - else - angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); - break; - case 2: - if (mode_disp < 14) - angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); - else - angular_pdpc_ver_w16_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); - break; - default: - assert(false && "Intra PDPC: Invalid scale.\n"); - } + case 2: + if (mode_disp < 14) + angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_w16_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); break; default: - assert(false && "Intra PDPC: Invalid width.\n"); + assert(false && "Intra PDPC: Invalid scale.\n"); + } + break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); } } else { switch (width) { - case 4: - // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. - // Handle high angles with more tight reference spacing with separate functions with more optimized loads. - if (mode_disp < 6) - angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); - else - angular_pdpc_hor_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); - break; - case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; - case 16: // 16 width and higher done with the same function - case 32: - case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; - default: - assert(false && "Intra PDPC: Invalid width.\n"); + case 4: + // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. + // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + if (mode_disp < 6) + angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_hor_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + break; + case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; + case 16: // 16 width and higher done with the same function + case 32: + case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + default: + assert(false && "Intra PDPC: Invalid width.\n"); } } - } + } } } From 046c814ccace6cffa0e6f570039a50b7a1f215fa Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Apr 2024 15:10:24 +0300 Subject: [PATCH 161/237] Replace memset with set1 and store. 
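The change here swaps the per-row memset for a single byte broadcast plus explicit SSE stores sized to the block width. A minimal, self-contained sketch of the pattern, assuming plain uint8_t pixels and illustrative function names that do not appear in the patch:

#include <immintrin.h>
#include <stdint.h>

/* Fill one 16-pixel prediction row with a single reference sample. */
static void fill_row_w16(uint8_t *dst, uint8_t sample)
{
  const __m128i v = _mm_set1_epi8((char)sample); /* 16 copies of the sample */
  _mm_storeu_si128((__m128i *)dst, v);           /* one 16-byte store per row */
}

/* Wider rows simply issue more stores; a 32-pixel row needs two. */
static void fill_row_w32(uint8_t *dst, uint8_t sample)
{
  const __m128i v = _mm_set1_epi8((char)sample);
  _mm_storeu_si128((__m128i *)(dst + 0),  v);
  _mm_storeu_si128((__m128i *)(dst + 16), v);
}

In the patch itself the store count is chosen with a switch on the block width inside an UNROLL macro, so every width/height pair compiles down to straight-line broadcasts and stores with no per-row call overhead.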
--- src/strategies/avx2/intra-avx2.c | 67 ++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7c63e2f0..9b82ab3c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3579,18 +3579,63 @@ static void uvg_angular_pred_avx2( } } else { - for (int y = 0; y < height; ++y) { - switch (width) { - case 4: memset(&dst[y * 4], ref_main[y + 1], 4 * sizeof(uvg_pixel)); break; - case 8: memset(&dst[y * 8], ref_main[y + 1], 8 * sizeof(uvg_pixel)); break; - case 16: memset(&dst[y * 16], ref_main[y + 1], 16 * sizeof(uvg_pixel)); break; - case 32: memset(&dst[y * 32], ref_main[y + 1], 32 * sizeof(uvg_pixel)); break; - case 64: memset(&dst[y * 64], ref_main[y + 1], 64 * sizeof(uvg_pixel)); break; - default: - assert(false && "Intra angular predicion: illegal width.\n"); - break; - } + #define UNROLL(w, h) \ + if ((h) == height && (w) == width) { \ + for (int y = 0; y < (h); ++y) { \ + const __m128i vdst = _mm_set1_epi8(ref_main[y + 1]); \ + switch ((w)) {\ + case 4: _mm_storeu_si32((__m128i*) &dst[y * 4], vdst); break;\ + case 8: _mm_storeu_si64((__m128i*) &dst[y * 8], vdst); break;\ + case 16: _mm_store_si128((__m128i*) &dst[y * 16], vdst); break;\ + case 32:\ + _mm_store_si128((__m128i*) &dst[y * 32 + 0], vdst);\ + _mm_store_si128((__m128i*) &dst[y * 32 + 16], vdst);\ + break;\ + case 64: \ + _mm_store_si128((__m128i*) &dst[y * 64 + 0], vdst);\ + _mm_store_si128((__m128i*) &dst[y * 64 + 16], vdst);\ + _mm_store_si128((__m128i*) &dst[y * 64 + 32], vdst);\ + _mm_store_si128((__m128i*) &dst[y * 64 + 48], vdst);\ + break; \ + default:\ + assert(false && "Intra angular predicion: illegal width.\n");\ + break;\ + }\ + } \ } + UNROLL(4, 4); + UNROLL(4, 8); + UNROLL(4, 16); + UNROLL(4, 32); + UNROLL(4, 64); + UNROLL(8, 2); + UNROLL(8, 4); + UNROLL(8, 8); + UNROLL(8, 16); + UNROLL(8, 32); + UNROLL(8, 64); + UNROLL(16, 1); + UNROLL(16, 2); + UNROLL(16, 4); + UNROLL(16, 8); + UNROLL(16, 16); + UNROLL(16, 32); + UNROLL(16, 64); + UNROLL(32, 1); + UNROLL(32, 2); + UNROLL(32, 4); + UNROLL(32, 8); + UNROLL(32, 16); + UNROLL(32, 32); + UNROLL(32, 64); + UNROLL(64, 1); + UNROLL(64, 2); + UNROLL(64, 4); + UNROLL(64, 8); + UNROLL(64, 16); + UNROLL(64, 32); + UNROLL(64, 64); + #undef UNROLL } } From 9e33a6a797b96782c896c858989ad21e805172d7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 24 Apr 2024 18:10:03 +0300 Subject: [PATCH 162/237] Implement PDPC for intra mode 50. --- src/strategies/avx2/intra-avx2.c | 133 ++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9b82ab3c..f4223fc7 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3064,6 +3064,7 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } +// This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) { const int limit = MIN(3 << scale, height); @@ -3337,6 +3338,7 @@ static void angular_pdpc_mode18_w64_avx2(uvg_pixel* dst, const uvg_pixel top_lef } +// This is the non-vectorized version of pdpc mode 50. It is left here for archiving purposes. 
static void angular_pdpc_mode50_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) { const int limit = MIN(3 << scale, width); @@ -3350,6 +3352,121 @@ static void angular_pdpc_mode50_avx2(uvg_pixel* dst, const uvg_pixel top_left, c } } +static void angular_pdpc_mode50_w4_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 4; + int limit = MIN(3 << scale, width); // Not used + + //__m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); + const __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03 + ); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + const uint32_t ref4 = *(uint32_t*)&ref_side[1 + y]; + __m128i vref = _mm_set1_epi32(ref4); + vref = _mm_shuffle_epi8(vref, vshuf); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); + __m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_mode50_w8_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 8; + int limit = MIN(3 << scale, width); // Not used. + + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_weight[offset]); + const __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 + ); + + // For width 8, height must be at least 2. Handle 2 lines at once. 
+ for (int y = 0; y < height; y += 2) { + const uint16_t ref2 = *(uint16_t*)&ref_side[1 + y]; + __m128i vref = _mm_set1_epi16(ref2); + vref = _mm_shuffle_epi8(vref, vshuf); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_mode50_w16_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) +{ + int limit = MIN(3 << scale, width); // Not used. + + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w16_ver_weight[offset]); + const __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + for (int y = 0; y < height; ++y) { + __m256i vref = _mm256_set1_epi16((int16_t)ref_side[1 + y]); + + __m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + + __m256i accu = _mm256_sub_epi16(vref, vtopleft); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + y * width), filtered); + } +} + static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, @@ -3658,8 +3775,8 @@ static void uvg_angular_pred_avx2( const uvg_pixel top_left = ref_main[0]; switch (width) { - case 4: angular_pdpc_mode18_w4_avx2(dst, top_left, ref_side, height, scale); break; - case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; + case 4: angular_pdpc_mode18_w4_avx2(dst, top_left, ref_side, height, scale); break; + case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; case 64: angular_pdpc_mode18_w64_avx2(dst, top_left, ref_side, height, scale); break; @@ -3671,8 +3788,16 @@ static void uvg_angular_pred_avx2( else if (pred_mode == 50) { scale = (log2_width + log2_height - 2) >> 2; const uvg_pixel top_left = ref_main[0]; - angular_pdpc_mode50_avx2(dst, top_left, ref_side, width, height, scale); - + switch (width) { + case 4: angular_pdpc_mode50_w4_avx2(dst, top_left, ref_side, height, scale); break; + case 8: angular_pdpc_mode50_w8_avx2(dst, top_left, ref_side, height, scale); break; + case 16: // 16 and higher handled by same function. + case 32: + case 64: angular_pdpc_mode50_w16_avx2(dst, top_left, ref_side, width, height, scale); break; + default: + assert(false && "Intra PDPC, invalid width.\n"); + break; + } } else { if (vertical_mode) { From 9dd3d1f83147c721ffdbcf84651cb78e88b5fd67 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 26 Apr 2024 01:34:17 +0300 Subject: [PATCH 163/237] Implement scale1 version of mode 50 pdpc. 
Reduces the amount of unnecessary work done on 16 width and wider blocks. --- src/strategies/avx2/intra-avx2.c | 60 ++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f4223fc7..0e7105a9 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2136,7 +2136,7 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, int limit = MIN(3 << scale, width); __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - //__m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width + __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width __m256i v32s = _mm256_set1_epi16(32); const int offset = scale * 16; @@ -3467,6 +3467,53 @@ static void angular_pdpc_mode50_w16_avx2(uvg_pixel* dst, const uvg_pixel top_lef } } +static void angular_pdpc_mode50_scale1_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height) +{ + //const int scale = 1; + //int limit = MIN(3 << scale, width); // Not used. + + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = 16; // scale * 16 + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); + const __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + const __m128i vshuf = _mm_setr_epi8( + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 + ); + + const int log2w = uvg_g_convert_to_log2[width]; + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, log2w); + + // For width 8, height must be at least 2. Handle 2 lines at once. + for (int y = 0; y < height; y += 2) { + const uint16_t ref2 = *(uint16_t*)&ref_side[1 + y]; + __m128i vref = _mm_set1_epi16(ref2); + vref = _mm_shuffle_epi8(vref, vshuf); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + //__m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); + __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + //_mm_store_si128((__m128i*)(dst + (y * width)), filtered); + *(uint64_t*)(dst + ((y + 0) * width)) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + ((y + 1) * width)) = _mm_extract_epi64(filtered, 1); + } +} static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, @@ -3791,9 +3838,16 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pdpc_mode50_w4_avx2(dst, top_left, ref_side, height, scale); break; case 8: angular_pdpc_mode50_w8_avx2(dst, top_left, ref_side, height, scale); break; - case 16: // 16 and higher handled by same function. + case 16: // 16 and higher handled by same functions. 
case 32: - case 64: angular_pdpc_mode50_w16_avx2(dst, top_left, ref_side, width, height, scale); break; + case 64: + if (scale == 1) { + angular_pdpc_mode50_scale1_avx2(dst, top_left, ref_side, width, height); + } + else { + angular_pdpc_mode50_w16_avx2(dst, top_left, ref_side, width, height, scale); + } + break; default: assert(false && "Intra PDPC, invalid width.\n"); break; From 671bf36f40817aecad3ee64d390bb09b63d5c37c Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 2 May 2024 14:48:12 +0300 Subject: [PATCH 164/237] Fix error with intra mode 16. For some block sizes, the resulting wide angle mode had no entry in wide angle delta int and fract tables. --- src/strategies/avx2/intra-avx2.c | 39 +++++--------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0e7105a9..10af9254 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -235,9 +235,9 @@ ALIGNED(32) static const int16_t delta_fract_table[2112] = { }; -// TODO: cut this table in half due to symmetry // Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_wide_angle_table[1792] = { +ALIGNED(32) static const int16_t delta_int_wide_angle_table[960] = { + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 @@ -252,24 +252,11 @@ ALIGNED(32) static const int16_t delta_int_wide_angle_table[1792] = { 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, // 0 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 1 - 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 67 - 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, - 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, // 69 - 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, - 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, // 71 - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, - 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, // 73 - 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, - 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, // 75 - 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, - 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, // 77 - 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, - 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 
543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, // 79 - 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, }; -// TODO: cut out the latter 32 entries due to symmetry. Also, cut in half due to vertical symmetry -ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1792] = { +// TODO: Can be cut in half due to horizontal symmetry +ALIGNED(32) static const int16_t delta_fract_wide_angle_table[960] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 @@ -284,20 +271,6 @@ ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1792] = { 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, // 0 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 1 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 67 - 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, -13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, // 69 -19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, -25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, // 71 - 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, // 73 -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, // 75 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, // 77 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, // 79 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; @@ -3643,7 +3616,7 @@ static void uvg_angular_pred_avx2( // The mode is not horizontal or vertical, we have to do interpolation. // Set delta table pointers - const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 12) * 64 : (80 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); + const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 13) * 64 : (81 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset]; const int16_t* delta_fract = wide_angle_mode ? &delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; From eb979180bc2d7f391cabfc166c35cc9647374d2b Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 3 May 2024 13:18:37 +0300 Subject: [PATCH 165/237] Move _avx2 identifier in function names to the end of the function name for consistency. 
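
Only the names change in this patch; signatures and bodies are untouched. As an
illustration of the convention (one pair copied from the diff below, shown here
as bare prototypes for brevity):

    /* Before: the AVX2 marker sits in the middle of the name. */
    static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main,
                                         const int16_t* delta_int, const int16_t* delta_fract,
                                         const int height, const int use_cubic);

    /* After: the marker becomes a suffix, matching names such as uvg_angular_pred_avx2. */
    static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
                                         const int16_t* delta_int, const int16_t* delta_fract,
                                         const int height, const int use_cubic);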
--- src/strategies/avx2/intra-avx2.c | 92 ++++++++++++++++---------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 10af9254..a04ed133 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -815,7 +815,7 @@ static const int8_t cubic_filter_8bit_g[32][4] = }; -static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 4; @@ -900,7 +900,7 @@ static void angular_pred_avx2_w4_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 8; @@ -982,7 +982,7 @@ static void angular_pred_avx2_w8_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) +static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) { const __m256i p_shuf_01 = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -1076,7 +1076,7 @@ static void angular_pred_avx2_w16_ver(uvg_pixel* dst, const uvg_pixel* ref_main, } -static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 4; @@ -1172,7 +1172,7 @@ static void angular_pred_avx2_w4_hor(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) { const int width = 8; @@ -1225,7 +1225,7 @@ static void angular_pred_avx2_w8_hor(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_avx2_w16_hor(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) +static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) { int8_t f[64][4] = { { 0 } }; if (use_cubic) { @@ -1292,7 +1292,7 @@ static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, c // Linear interpolation filter for width 4 has a different call, since it uses premade tables for coefficients -static void 
angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int32_t pred_mode) +static void angular_pred_linear_filter_w4_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int32_t pred_mode) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); @@ -1340,7 +1340,7 @@ static void angular_pred_avx2_linear_filter_w4_ver(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) +static void angular_pred_linear_filter_w8_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const int width = 8; const __m128i v16s = _mm_set1_epi16(16); @@ -1380,7 +1380,7 @@ static void angular_pred_avx2_linear_filter_w8_ver(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) +static void angular_pred_linear_filter_w16_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const __m128i v16s = _mm_set1_epi16(16); const __m128i vshuf = _mm_setr_epi8( @@ -1417,7 +1417,7 @@ static void angular_pred_avx2_linear_filter_w16_ver(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) +static void angular_pred_linear_filter_w32_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const __m256i v16s = _mm256_set1_epi16(16); const __m256i vshuf = _mm256_setr_epi8( @@ -1460,7 +1460,7 @@ static void angular_pred_avx2_linear_filter_w32_ver(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) +static void angular_pred_linear_filter_w4_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); @@ -1496,7 +1496,7 @@ static void angular_pred_avx2_linear_filter_w4_hor(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) +static void angular_pred_linear_filter_w8_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); @@ -1530,7 +1530,7 @@ static void angular_pred_avx2_linear_filter_w8_hor(uvg_pixel* dst, uvg_pixel* re } -static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) +static void angular_pred_linear_filter_w16_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); @@ -1578,7 +1578,7 @@ static void angular_pred_avx2_linear_filter_w16_hor(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) +static void angular_pred_linear_filter_w32_hor_avx2(uvg_pixel* dst, uvg_pixel* 
ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; const __m128i v16s = _mm_set1_epi16(16); @@ -1629,7 +1629,7 @@ static void angular_pred_avx2_linear_filter_w32_hor(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_avx2_linear_filter_w8_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_linear_filter_w8_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; const int16_t* dint = delta_int; @@ -1668,7 +1668,7 @@ static void angular_pred_avx2_linear_filter_w8_ver_wide_angle(uvg_pixel* dst, uv } -static void angular_pred_avx2_linear_filter_w16_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_linear_filter_w16_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 16; const int16_t* dint = delta_int; @@ -1703,7 +1703,7 @@ static void angular_pred_avx2_linear_filter_w16_ver_wide_angle(uvg_pixel* dst, u } -static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_linear_filter_w32_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 32; const int16_t* dint = delta_int; @@ -1741,7 +1741,7 @@ static void angular_pred_avx2_linear_filter_w32_ver_wide_angle(uvg_pixel* dst, u } -static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) +static void angular_pred_linear_filter_w4_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const __m128i v16s = _mm_set1_epi16(16); @@ -1788,7 +1788,7 @@ static void angular_pred_avx2_linear_filter_w4_hor_wide_angle(uvg_pixel* dst, uv } -static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_linear_filter_w8_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; const __m128i v16s = _mm_set1_epi16(16); @@ -1864,7 +1864,7 @@ static void angular_pred_avx2_linear_filter_w8_hor_wide_angle(uvg_pixel* dst, uv } -static void angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) +static void angular_pred_linear_filter_w16_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 16; const __m128i v16s = _mm_set1_epi16(16); @@ -1960,7 +1960,7 @@ static void angular_pred_avx2_linear_filter_w16_hor_wide_angle(uvg_pixel* dst, u } -static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) +static void 
angular_pred_linear_filter_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { // 2-tap linear filter @@ -1974,7 +1974,7 @@ static void angular_pred_avx2_linear_filter_hor(uvg_pixel* dst, uvg_pixel* ref, } -static void angular_pred_avx2_non_fractional_angle_pxl_copy_ver(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) +static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { // Note: this probably won't work for wide angle modes. for (int y = 0; y < height; ++y) { @@ -1990,7 +1990,7 @@ static void angular_pred_avx2_non_fractional_angle_pxl_copy_ver(uvg_pixel* dst, } } -static void angular_pred_avx2_non_fractional_angle_pxl_copy_hor(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) +static void angular_pred_non_fractional_angle_pxl_copy_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { // TODO: replace this generic solution after testing for (int y = 0; y < height; ++y) { @@ -3627,11 +3627,11 @@ static void uvg_angular_pred_avx2( if (channel_type == 0) { if (vertical_mode) { switch (width) { - case 4: angular_pred_avx2_w4_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 8: angular_pred_avx2_w8_ver(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 16: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 32: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 64: angular_pred_avx2_w16_ver(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 32: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 64: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; @@ -3639,11 +3639,11 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: angular_pred_avx2_w4_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 8: angular_pred_avx2_w8_hor(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; - case 16: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 32: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 64: angular_pred_avx2_w16_hor(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 4: angular_pred_w4_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 8: angular_pred_w8_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 16: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 32: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 64: angular_pred_w16_hor_avx2(dst, 
ref_main, delta_int, delta_fract, width, height, use_cubic); break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; @@ -3657,10 +3657,10 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { // No wide angle handling for w4 is needed. - case 4: angular_pred_avx2_linear_filter_w4_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 8: angular_pred_avx2_linear_filter_w8_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 16: angular_pred_avx2_linear_filter_w16_ver(dst, ref_main, height, delta_int, pred_mode); break; - case 32: angular_pred_avx2_linear_filter_w32_ver(dst, ref_main, height, delta_int, pred_mode); break; + case 4: angular_pred_linear_filter_w4_ver_avx2(dst, ref_main, height, delta_int, pred_mode); break; + case 8: angular_pred_linear_filter_w8_ver_avx2(dst, ref_main, height, delta_int, pred_mode); break; + case 16: angular_pred_linear_filter_w16_ver_avx2(dst, ref_main, height, delta_int, pred_mode); break; + case 32: angular_pred_linear_filter_w32_ver_avx2(dst, ref_main, height, delta_int, pred_mode); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; @@ -3669,9 +3669,9 @@ static void uvg_angular_pred_avx2( else { if (wide_angle_mode) { switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int); break; - case 8: angular_pred_avx2_linear_filter_w8_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; - case 16: angular_pred_avx2_linear_filter_w16_hor_wide_angle(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 4: angular_pred_linear_filter_w4_hor_wide_angle_avx2(dst, ref_main, height, pred_mode, delta_int); break; + case 8: angular_pred_linear_filter_w8_hor_wide_angle_avx2(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; + case 16: angular_pred_linear_filter_w16_hor_wide_angle_avx2(dst, ref_main, height, pred_mode, delta_int, delta_fract); break; case 32: assert(false && "This code branch only works with UVG_FORMAT_P420."); break; // This branch is never executed with UVG_FORMAT_P420, due to chroma being only 32 width or height. 
default: assert(false && "Intra angular predicion: illegal chroma width.\n"); @@ -3680,10 +3680,10 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: angular_pred_avx2_linear_filter_w4_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 8: angular_pred_avx2_linear_filter_w8_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 16: angular_pred_avx2_linear_filter_w16_hor(dst, ref_main, height, pred_mode, delta_int); break; - case 32: angular_pred_avx2_linear_filter_w32_hor(dst, ref_main, height, pred_mode, delta_int); break; + case 4: angular_pred_linear_filter_w4_hor_avx2(dst, ref_main, height, pred_mode, delta_int); break; + case 8: angular_pred_linear_filter_w8_hor_avx2(dst, ref_main, height, pred_mode, delta_int); break; + case 16: angular_pred_linear_filter_w16_hor_avx2(dst, ref_main, height, pred_mode, delta_int); break; + case 32: angular_pred_linear_filter_w32_hor_avx2(dst, ref_main, height, pred_mode, delta_int); break; default: assert(false && "Intra angular predicion: illegal chroma width.\n"); break; @@ -3695,10 +3695,10 @@ static void uvg_angular_pred_avx2( else { // No interpolation or filtering needed, just copy the integer samples if (vertical_mode) { - angular_pred_avx2_non_fractional_angle_pxl_copy_ver(dst, ref_main, width, height, delta_int); + angular_pred_non_fractional_angle_pxl_copy_ver_avx2(dst, ref_main, width, height, delta_int); } else { - angular_pred_avx2_non_fractional_angle_pxl_copy_hor(dst, ref_main, width, height, delta_int); + angular_pred_non_fractional_angle_pxl_copy_hor_avx2(dst, ref_main, width, height, delta_int); } } } From f572f20d67af4a252e18343ab35a80efd899f57a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 10:51:57 +0300 Subject: [PATCH 166/237] Move tables --- src/strategies/avx2/intra-avx2.c | 221 +------ src/strategies/avx2/intra_avx2_tables.h | 222 +++++++ src/strategies/avx2/mip_data_avx2.h | 604 ++++++++++++++++++ src/strategies/generic/intra-generic.c | 2 +- .../generic/mip_data_generic.h} | 560 +--------------- 5 files changed, 829 insertions(+), 780 deletions(-) create mode 100644 src/strategies/avx2/mip_data_avx2.h rename src/{mip_data.h => strategies/generic/mip_data_generic.h} (55%) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a04ed133..debaf417 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -48,231 +48,12 @@ #include "global.h" #include "intra-avx2.h" #include "intra_avx2_tables.h" -#include "mip_data.h" +#include "strategies/avx2/mip_data_avx2.h" #include "uvg_math.h" #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" -// Y coord tables -ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { - 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, // offset 0, line == 64 - 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, - 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, - 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, - 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, - 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, - 39, 
25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, - 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, - 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, - 23, 41, 23, 41, 23, 41, 23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, - 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, - 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, - 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, - 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, - 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, - 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, // offset 16, line == 32 - 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, - 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, - 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, - 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, - 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, - 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, - 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, // offset 24, line == 16 - 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, - 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, - 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, // offset 28, line == 8 - 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, - 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, // offset 30, line == 4 - 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. 
line == 2 -}; - -ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { - 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, // offset 0, line == 64 - 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, - 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, - 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, - 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, - 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, - 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, - 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, - 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, - 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, - 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, - 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, - 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, - 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, - 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, - 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, - 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, - 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, - 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, - 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, - 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, - 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, - 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, - 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, - 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, - 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, - 9, 55, 9, 
55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, - 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, - 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, - 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, - 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, - 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, // offset 32, line == 32 - 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, - 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, - 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, - 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, - 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, - 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, - 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, - 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, - 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, - 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, - 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, - 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, - 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, - 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, - 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, - 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, // offset 48, line == 16 - 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, - 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, // offset 56, line == 8 - 5, 3, 
5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, - 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, - 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // offset 60, line == 4 - 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // offset 62, line == 2 - 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever -}; - - -// Delta int and delta fract tables. Rows are prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_table[2112] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 
4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal and vertical mode - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, 
-23, -24, -24, -24, // 26 - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, - -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode -}; - - -// TODO: cut this table to 32 width, the second 32 width half is identical to the first -ALIGNED(32) static const int16_t delta_fract_table[2112] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode -29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, -26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, -23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 
12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, -20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 -18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, -16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, -14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, -12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 -10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal & vertical mode -31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 
14, 12, 10, 8, 6, 4, 2, 0, -29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, -28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 -26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, -24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, -20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 -18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, -16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, -14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, -12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode -}; - - -// Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. 
(or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_wide_angle_table[960] = { - 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 - 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 - 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, - 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 - 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, - 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, // -8 - 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, - 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // -6 - 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, // -4 - 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, - 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 
60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // -2 - 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, - 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, // 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 1 -}; - -// TODO: Can be cut in half due to horizontal symmetry -ALIGNED(32) static const int16_t delta_fract_wide_angle_table[960] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 -21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 -11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -8 - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // -6 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -4 -25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, -19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 
13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // -2 -13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, - 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, // 0 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 1 -}; - /** * \brief Generate angular predictions. diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 7829e842..8852eabc 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -2084,4 +2084,226 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, // Mode disp 31 }; + + +// Y coord tables +ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { + 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, // offset 0, line == 64 + 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, + 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, + 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, + 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, + 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, + 39, 25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, + 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, + 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, + 23, 41, 23, 41, 23, 41, 23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, + 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, + 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, + 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, + 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, + 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, + 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 29, 3, 29, 3, 29, 3, 29, 3, 28, 
4, 28, 4, 28, 4, 28, 4, // offset 16, line == 32 + 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, + 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, + 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, + 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, + 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, + 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, // offset 24, line == 16 + 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, + 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, // offset 28, line == 8 + 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, + 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, // offset 30, line == 4 + 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 2, // offset 31. 
line == 2 +}; + +ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { + 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, // offset 0, line == 64 + 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, + 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, + 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, + 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, + 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, + 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, + 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, + 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, + 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, + 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, + 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, + 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, + 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, + 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, + 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 31, 33, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, 30, 34, + 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 29, 35, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, 28, 36, + 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 27, 37, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, 26, 38, + 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 25, 39, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, 24, 40, + 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 23, 41, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, 22, 42, + 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 21, 43, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, 20, 44, + 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 19, 45, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, 18, 46, + 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 17, 47, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, + 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 15, 49, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, 14, 50, + 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 13, 51, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, 12, 52, + 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 11, 53, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, 10, 54, + 9, 55, 9, 
55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 9, 55, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, + 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 7, 57, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, 6, 58, + 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 5, 59, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, + 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 3, 61, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, + 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, + 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, // offset 32, line == 32 + 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, + 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, + 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, + 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, + 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, + 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, + 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, + 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, + 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, + 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, + 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, + 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, + 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, + 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, // offset 48, line == 16 + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, + 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, // offset 56, line == 8 + 5, 3, 
5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, + 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, + 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // offset 60, line == 4 + 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // offset 62, line == 2 + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, // offset 64, line == 1, this might not be needed, ever +}; + + +// Delta int and delta fract tables. Rows are prediction mode, columns y offset. (or x offset for horizontal modes) +ALIGNED(32) static const int16_t delta_int_table[2112] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 
4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal and vertical mode + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, 
-23, -24, -24, -24, // 26 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode +}; + + +// TODO: cut this table to 32 width, the second 32 width half is identical to the first +ALIGNED(32) static const int16_t delta_fract_table[2112] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 
12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 +10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal & vertical mode +31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, +30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 
14, 12, 10, 8, 6, 4, 2, 0, +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, +28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode +}; + + +// Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. 
(or x offset for horizontal modes) +ALIGNED(32) static const int16_t delta_int_wide_angle_table[960] = { + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 + 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 + 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 + 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, + 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, // -8 + 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, + 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // -6 + 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, // -4 + 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, + 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 
60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // -2 + 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, // 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 1 +}; + +// TODO: Can be cut in half due to horizontal symmetry +ALIGNED(32) static const int16_t delta_fract_wide_angle_table[960] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 +21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 +11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -8 + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // -6 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -4 +25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, +19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 
13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // -2 +13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, + 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, // 0 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 1 +}; + + #endif INTRA_AVX2_TABLES_H diff --git a/src/strategies/avx2/mip_data_avx2.h b/src/strategies/avx2/mip_data_avx2.h new file mode 100644 index 00000000..342b1b0c --- /dev/null +++ b/src/strategies/avx2/mip_data_avx2.h @@ -0,0 +1,604 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + + /** +* \ingroup Reconstruction +* \file +* MIP weight matrix data. +*/ + +/** \file MipData.h +\brief weight and bias data for matrix-based intra prediction (MIP) +*/ +#pragma once + +#define MIP_SHIFT_MATRIX 6 +#define MIP_OFFSET_MATRIX 32 +// MIP weight tables for AVX2. + +// This is the same table as used in generic version, but 16-bit. 
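+// Layout (from the array dimensions): 16 MIP modes for size_id 0, each a 16x4
+// matrix, i.e. four reduced-boundary weights for every sample of the 4x4
+// reduced prediction.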
+static ALIGNED(32) const uint16_t uvg_mip_sid0_weights[16][16][4] = +{ + { + { 32, 30, 90, 28}, + { 32, 32, 72, 28}, + { 34, 77, 53, 30}, + { 51, 124, 36, 37}, + { 31, 31, 95, 37}, + { 33, 31, 70, 50}, + { 52, 80, 25, 60}, + { 78, 107, 1, 65}, + { 31, 29, 37, 95}, + { 38, 34, 19, 101}, + { 73, 85, 0, 81}, + { 92, 99, 0, 65}, + { 34, 29, 14, 111}, + { 48, 48, 7, 100}, + { 80, 91, 0, 74}, + { 89, 97, 0, 64} + }, + { + { 31, 23, 34, 29}, + { 31, 43, 34, 31}, + { 30, 95, 34, 32}, + { 29, 100, 35, 33}, + { 31, 23, 34, 29}, + { 31, 43, 34, 31}, + { 30, 95, 34, 32}, + { 29, 99, 35, 33}, + { 31, 24, 35, 29}, + { 31, 44, 34, 31}, + { 30, 95, 35, 32}, + { 29, 99, 35, 33}, + { 31, 24, 35, 30}, + { 31, 44, 35, 31}, + { 30, 95, 35, 32}, + { 29, 99, 35, 33} + }, + { + { 32, 32, 36, 58}, + { 32, 29, 26, 66}, + { 36, 37, 23, 61}, + { 79, 84, 3, 37}, + { 32, 32, 30, 69}, + { 33, 29, 24, 71}, + { 44, 16, 21, 70}, + { 96, 18, 0, 57}, + { 32, 31, 24, 74}, + { 33, 30, 23, 71}, + { 36, 24, 24, 71}, + { 59, 9, 16, 68}, + { 32, 32, 23, 75}, + { 33, 30, 24, 70}, + { 32, 30, 25, 71}, + { 36, 26, 25, 70} + }, + { + { 32, 33, 34, 32}, + { 32, 30, 22, 38}, + { 29, 46, 25, 38}, + { 53, 123, 28, 22}, + { 32, 33, 30, 37}, + { 32, 30, 21, 38}, + { 32, 40, 24, 38}, + { 64, 116, 26, 17}, + { 32, 32, 23, 49}, + { 32, 30, 21, 39}, + { 34, 39, 24, 37}, + { 72, 109, 23, 16}, + { 33, 31, 17, 60}, + { 32, 31, 21, 39}, + { 35, 41, 24, 37}, + { 72, 106, 22, 18} + }, + { + { 34, 25, 89, 20}, + { 38, 32, 47, 24}, + { 40, 86, 29, 27}, + { 38, 98, 32, 29}, + { 34, 31, 94, 40}, + { 44, 25, 83, 27}, + { 54, 72, 43, 16}, + { 47, 94, 33, 22}, + { 33, 31, 36, 94}, + { 43, 23, 51, 76}, + { 62, 55, 64, 25}, + { 57, 89, 38, 15}, + { 32, 32, 28, 101}, + { 38, 26, 33, 94}, + { 55, 38, 68, 47}, + { 59, 80, 52, 16} + }, + { + { 28, 30, 68, 29}, + { 23, 48, 23, 48}, + { 39, 98, 16, 42}, + { 84, 86, 20, 17}, + { 25, 31, 52, 74}, + { 38, 68, 5, 70}, + { 95, 78, 7, 21}, + { 127, 54, 12, 0}, + { 30, 47, 14, 107}, + { 79, 76, 0, 53}, + { 127, 59, 7, 1}, + { 127, 51, 9, 0}, + { 50, 71, 1, 96}, + { 109, 69, 7, 25}, + { 127, 56, 9, 0}, + { 123, 53, 13, 0} + }, + { + { 40, 20, 72, 18}, + { 48, 29, 44, 18}, + { 53, 81, 35, 18}, + { 48, 96, 33, 22}, + { 45, 23, 79, 49}, + { 61, 21, 56, 49}, + { 72, 52, 32, 48}, + { 65, 69, 20, 50}, + { 41, 27, 29, 96}, + { 49, 22, 28, 94}, + { 52, 22, 28, 93}, + { 49, 27, 27, 92}, + { 37, 29, 26, 98}, + { 39, 28, 28, 97}, + { 38, 28, 30, 97}, + { 38, 29, 30, 95} + }, + { + { 33, 27, 43, 27}, + { 32, 29, 31, 31}, + { 31, 73, 33, 31}, + { 35, 104, 34, 28}, + { 32, 30, 63, 22}, + { 33, 26, 33, 29}, + { 33, 57, 33, 30}, + { 37, 100, 35, 27}, + { 32, 31, 85, 25}, + { 34, 25, 39, 25}, + { 35, 39, 32, 28}, + { 40, 91, 35, 25}, + { 32, 30, 77, 50}, + { 34, 26, 54, 22}, + { 37, 31, 34, 27}, + { 45, 75, 34, 23} + }, + { + { 34, 25, 77, 19}, + { 36, 34, 56, 24}, + { 41, 83, 39, 30}, + { 47, 96, 28, 35}, + { 34, 31, 70, 65}, + { 38, 29, 53, 77}, + { 43, 36, 37, 83}, + { 48, 39, 28, 83}, + { 33, 31, 31, 98}, + { 33, 31, 30, 99}, + { 34, 30, 31, 98}, + { 36, 29, 31, 96}, + { 32, 32, 30, 97}, + { 32, 32, 31, 96}, + { 31, 33, 33, 96}, + { 32, 33, 34, 94} + }, + { + { 30, 30, 93, 19}, + { 31, 59, 67, 34}, + { 31, 79, 36, 59}, + { 30, 67, 17, 79}, + { 30, 38, 68, 69}, + { 29, 40, 43, 91}, + { 26, 35, 32, 101}, + { 23, 32, 30, 101}, + { 26, 34, 30, 101}, + { 23, 33, 30, 102}, + { 20, 32, 31, 102}, + { 18, 33, 32, 102}, + { 23, 33, 31, 100}, + { 20, 34, 32, 100}, + { 18, 35, 33, 100}, + { 18, 35, 33, 100} + }, + { + { 31, 54, 90, 26}, + { 32, 
60, 53, 61}, + { 34, 49, 37, 84}, + { 34, 39, 35, 89}, + { 35, 38, 41, 88}, + { 35, 35, 32, 96}, + { 35, 31, 33, 96}, + { 35, 32, 35, 94}, + { 34, 34, 30, 97}, + { 35, 32, 33, 95}, + { 35, 32, 34, 94}, + { 35, 34, 34, 93}, + { 34, 34, 34, 93}, + { 35, 34, 34, 93}, + { 35, 34, 34, 92}, + { 36, 34, 35, 91} + }, + { + { 32, 29, 54, 24}, + { 31, 32, 34, 29}, + { 31, 43, 34, 29}, + { 32, 67, 36, 28}, + { 31, 34, 69, 37}, + { 31, 35, 46, 33}, + { 30, 35, 39, 33}, + { 30, 42, 39, 36}, + { 31, 35, 39, 88}, + { 30, 38, 41, 84}, + { 30, 39, 40, 81}, + { 39, 46, 38, 78}, + { 31, 36, 34, 96}, + { 34, 38, 37, 93}, + { 55, 42, 38, 82}, + { 89, 53, 38, 65} + }, + { + { 32, 33, 43, 29}, + { 32, 30, 29, 33}, + { 31, 47, 31, 33}, + { 33, 100, 31, 31}, + { 32, 33, 74, 25}, + { 32, 32, 34, 31}, + { 32, 33, 30, 33}, + { 32, 68, 30, 32}, + { 32, 31, 91, 40}, + { 32, 32, 58, 26}, + { 31, 31, 30, 32}, + { 31, 42, 30, 33}, + { 32, 31, 49, 85}, + { 32, 31, 83, 35}, + { 31, 33, 48, 29}, + { 31, 36, 32, 33} + }, + { + { 31, 29, 81, 35}, + { 32, 28, 34, 50}, + { 31, 75, 16, 43}, + { 34, 103, 29, 32}, + { 32, 32, 53, 78}, + { 31, 28, 36, 88}, + { 30, 52, 18, 73}, + { 52, 88, 17, 35}, + { 32, 32, 35, 94}, + { 30, 31, 35, 95}, + { 36, 29, 31, 92}, + { 100, 43, 16, 40}, + { 32, 32, 35, 93}, + { 30, 32, 38, 93}, + { 55, 18, 37, 83}, + { 127, 0, 30, 40} + }, + { + { 31, 22, 47, 30}, + { 31, 48, 25, 34}, + { 30, 95, 31, 32}, + { 32, 103, 33, 32}, + { 30, 24, 57, 31}, + { 30, 47, 26, 34}, + { 31, 95, 31, 32}, + { 43, 97, 35, 25}, + { 29, 26, 44, 63}, + { 37, 38, 24, 47}, + { 74, 63, 28, 20}, + { 110, 58, 34, 3}, + { 46, 22, 5, 108}, + { 93, 5, 9, 77}, + { 127, 0, 17, 52}, + { 127, 0, 15, 50} + }, + { + { 32, 27, 68, 24}, + { 35, 23, 35, 28}, + { 35, 64, 29, 29}, + { 37, 104, 33, 28}, + { 32, 32, 91, 40}, + { 36, 23, 67, 36}, + { 49, 23, 39, 28}, + { 60, 67, 30, 20}, + { 32, 32, 36, 95}, + { 35, 29, 38, 93}, + { 50, 16, 30, 84}, + { 72, 16, 15, 65}, + { 32, 32, 27, 100}, + { 33, 32, 29, 100}, + { 37, 29, 30, 98}, + { 48, 21, 29, 90} + } +}; + +// Weight vectors for MIP size_id 1. 
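+// Stored as a flat array: 8 modes for size_id 1, each spanning 128 entries
+// (a 16x8 matrix: eight reduced-boundary weights per sample of the 4x4 reduced
+// prediction), as indicated by the per-mode offset comments below.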
+static ALIGNED(32) const uint16_t uvg_mip_sid1_weights[] = { + 30, 63, 30, 60, 29, 45, 30, 39, 46, 37, 66, 38, 74, 42, 62, 58, // mode 0, offset 0 + 25, 33, 32, 31, 32, 32, 32, 33, 33, 34, 32, 33, 32, 33, 32, 33, + 30, 66, 29, 54, 28, 48, 28, 41, 55, 39, 69, 40, 71, 43, 72, 46, + 32, 30, 33, 31, 32, 33, 32, 34, 30, 36, 31, 33, 32, 33, 32, 33, + 30, 66, 29, 55, 27, 46, 27, 42, 56, 40, 69, 39, 72, 43, 69, 48, + 32, 33, 33, 33, 33, 33, 32, 34, 28, 33, 30, 32, 32, 33, 32, 33, + 30, 63, 29, 56, 27, 47, 27, 42, 55, 40, 66, 40, 69, 44, 65, 50, + 32, 33, 33, 33, 33, 33, 32, 34, 35, 30, 33, 30, 33, 32, 32, 33, + 32, 33, 33, 56, 33, 77, 33, 37, 30, 31, 28, 30, 52, 26, 80, 41, // mode 1, offset 128 + 74, 30, 41, 29, 29, 34, 31, 34, 31, 32, 32, 32, 30, 32, 30, 32, + 32, 32, 33, 31, 33, 47, 33, 61, 33, 31, 31, 30, 28, 29, 44, 28, + 59, 76, 78, 40, 53, 27, 34, 32, 28, 31, 28, 32, 31, 31, 31, 31, + 32, 31, 32, 31, 33, 27, 33, 33, 34, 30, 34, 29, 34, 29, 34, 30, + 26, 64, 45, 86, 73, 55, 62, 33, 76, 27, 36, 29, 25, 32, 30, 31, + 32, 31, 32, 31, 32, 30, 33, 28, 34, 30, 35, 29, 36, 29, 37, 30, + 30, 29, 27, 53, 40, 80, 58, 60, 58, 74, 77, 35, 44, 31, 31, 33, + 32, 51, 32, 95, 32, 27, 32, 34, 27, 32, 42, 29, 99, 34, 21, 104, // mode 2, offset 256 + 27, 50, 29, 42, 31, 41, 31, 42, 29, 32, 30, 32, 29, 32, 30, 32, + 32, 45, 32, 77, 32, 38, 32, 30, 30, 32, 38, 30, 78, 33, 30, 87, + 9, 88, 9, 76, 14, 67, 20, 59, 40, 30, 38, 30, 37, 30, 38, 31, + 33, 37, 34, 44, 36, 39, 37, 31, 32, 32, 34, 31, 45, 31, 31, 54, + 27, 18, 25, 17, 24, 15, 25, 14, 106, 34, 108, 31, 108, 30, 101, 32, + 36, 33, 39, 32, 44, 33, 47, 30, 32, 30, 32, 29, 31, 27, 31, 32, + 29, 37, 27, 37, 25, 37, 25, 34, 13, 110, 15, 108, 16, 106, 19, 102, + 32, 48, 32, 33, 32, 29, 33, 33, 35, 35, 59, 40, 47, 65, 31, 81, // mode 3, offset 384 + 47, 68, 27, 71, 24, 62, 26, 50, 31, 31, 33, 30, 37, 30, 42, 32, + 32, 30, 32, 20, 33, 30, 36, 34, 40, 38, 46, 50, 29, 66, 27, 69, + 30, 70, 26, 55, 25, 41, 26, 31, 55, 31, 64, 31, 72, 33, 67, 39, + 33, 28, 36, 27, 43, 30, 51, 27, 36, 40, 33, 50, 26, 57, 28, 55, + 30, 26, 31, 20, 28, 17, 22, 23, 85, 47, 79, 53, 67, 62, 49, 70, + 38, 29, 51, 31, 69, 23, 77, 13, 32, 39, 28, 43, 30, 40, 35, 38, + 28, 30, 24, 31, 15, 38, 8, 43, 22, 104, 17, 102, 10, 95, 8, 90, + 32, 38, 32, 40, 32, 37, 33, 34, 32, 33, 37, 32, 46, 35, 30, 62, // mode 4, offset 512 +101, 40, 100, 36, 94, 33, 81, 35, 29, 32, 30, 32, 30, 31, 30, 31, + 32, 32, 32, 31, 33, 33, 33, 32, 33, 32, 33, 33, 33, 33, 34, 36, + 22, 102, 26, 104, 31, 103, 37, 94, 39, 29, 34, 28, 32, 28, 33, 28, + 32, 33, 32, 34, 33, 33, 33, 33, 32, 32, 33, 33, 34, 33, 33, 36, + 34, 24, 33, 30, 31, 37, 30, 46, 99, 36, 98, 32, 95, 29, 85, 31, + 32, 33, 32, 34, 32, 33, 33, 33, 32, 33, 33, 33, 34, 34, 32, 37, + 30, 34, 31, 32, 31, 29, 32, 30, 23, 104, 30, 98, 39, 91, 47, 82, + 32, 52, 33, 19, 33, 30, 34, 35, 48, 31, 62, 50, 20, 74, 23, 56, // mode 5, offset 640 + 38, 76, 25, 50, 29, 29, 31, 25, 26, 32, 51, 31, 54, 51, 41, 76, + 33, 25, 35, 28, 37, 35, 38, 32, 38, 39, 25, 47, 22, 38, 33, 29, + 28, 39, 31, 23, 31, 27, 30, 31, 83, 35, 57, 74, 30, 101, 27, 103, + 34, 32, 38, 33, 40, 32, 40, 32, 27, 37, 28, 32, 33, 27, 34, 27, + 32, 25, 30, 31, 29, 33, 28, 33, 41, 92, 18, 111, 18, 111, 23, 105, + 35, 32, 38, 31, 40, 32, 40, 32, 30, 33, 33, 30, 33, 29, 33, 30, + 31, 33, 29, 33, 29, 34, 29, 34, 20, 107, 21, 106, 22, 105, 24, 101, + 32, 28, 33, 30, 33, 60, 33, 63, 31, 33, 28, 33, 26, 33, 44, 36, // mode 6, offset 768 + 92, 33, 71, 26, 47, 28, 37, 31, 30, 31, 32, 30, 33, 30, 33, 30, + 33, 30, 33, 28, 
33, 30, 33, 38, 31, 33, 29, 34, 26, 33, 29, 32, + 43, 90, 71, 71, 86, 45, 74, 32, 33, 29, 26, 30, 28, 30, 33, 29, + 33, 32, 34, 31, 34, 31, 33, 32, 30, 32, 29, 33, 29, 33, 28, 34, + 29, 41, 26, 71, 37, 88, 55, 75, 95, 27, 73, 22, 46, 25, 36, 28, + 34, 31, 35, 32, 34, 33, 34, 34, 30, 32, 28, 33, 28, 33, 28, 34, + 33, 27, 33, 23, 30, 35, 33, 53, 43, 89, 77, 59, 91, 37, 74, 31, + 33, 49, 33, 71, 32, 23, 31, 33, 26, 32, 72, 24, 70, 68, 21, 106, // mode 7, offset 896 + 26, 52, 30, 32, 32, 32, 33, 32, 28, 31, 34, 31, 32, 32, 32, 33, + 34, 47, 34, 44, 32, 27, 30, 33, 32, 29, 89, 28, 46, 89, 20, 107, + 5, 86, 28, 37, 33, 31, 33, 33, 44, 26, 33, 30, 31, 32, 32, 33, + 35, 39, 34, 27, 31, 31, 29, 32, 42, 27, 87, 43, 32, 100, 22, 106, + 26, 24, 30, 34, 32, 33, 33, 33, 92, 35, 38, 31, 30, 32, 32, 33, + 35, 29, 34, 24, 31, 33, 29, 33, 47, 32, 69, 60, 31, 99, 25, 103, + 32, 32, 34, 33, 32, 33, 33, 33, 17, 100, 28, 44, 32, 31, 32, 35, +}; + + +// Weight vectors for MIP size_id 2. +static ALIGNED(32) const uint16_t uvg_mip_sid2_weights[] = { + 0, 42, 0, 71, 0, 77, 0, 64, 37, 33, 39, 34, 46, 35, 60, 35, // mode 0, offset 0 + 27, 44, 24, 36, 33, 30, 33, 31, 33, 35, 35, 36, 34, 36, 32, 36, + 0, 49, 0, 42, 0, 40, 0, 38, 71, 38, 66, 50, 52, 67, 43, 75, + 32, 32, 33, 31, 33, 31, 33, 32, 31, 36, 32, 36, 32, 35, 32, 35, + 0, 56, 0, 70, 0, 65, 0, 59, 40, 33, 49, 34, 57, 36, 60, 39, + 26, 43, 30, 28, 34, 28, 33, 30, 38, 36, 38, 38, 33, 39, 31, 38, + 0, 55, 0, 51, 0, 46, 0, 42, 60, 43, 61, 47, 62, 51, 60, 55, + 33, 30, 33, 30, 34, 30, 33, 31, 31, 38, 32, 37, 32, 37, 32, 37, + 0, 60, 0, 68, 0, 62, 0, 58, 42, 34, 52, 35, 58, 37, 59, 41, + 30, 37, 35, 22, 34, 28, 33, 30, 43, 38, 37, 40, 31, 40, 30, 39, + 0, 56, 0, 53, 0, 49, 0, 45, 59, 44, 60, 45, 65, 45, 64, 47, + 34, 30, 33, 30, 33, 30, 33, 31, 31, 38, 31, 38, 31, 38, 32, 38, + 0, 59, 0, 66, 0, 61, 0, 59, 44, 35, 53, 36, 58, 38, 57, 41, + 31, 34, 35, 25, 34, 29, 33, 30, 43, 41, 31, 43, 30, 40, 31, 39, + 0, 57, 0, 54, 0, 51, 0, 48, 58, 43, 61, 43, 64, 43, 64, 45, + 33, 30, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, + 0, 57, 0, 65, 0, 63, 0, 61, 45, 35, 54, 37, 56, 38, 56, 41, + 30, 35, 33, 33, 34, 30, 34, 30, 40, 44, 24, 44, 29, 39, 32, 39, + 0, 58, 0, 54, 0, 51, 0, 48, 58, 42, 62, 41, 65, 42, 63, 43, + 33, 31, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, + 0, 55, 0, 65, 0, 65, 0, 63, 46, 35, 53, 37, 54, 38, 55, 39, + 30, 36, 32, 36, 33, 31, 33, 30, 38, 47, 26, 40, 30, 38, 32, 38, + 0, 59, 0, 54, 0, 49, 0, 48, 58, 40, 64, 40, 66, 40, 64, 42, + 33, 31, 33, 31, 32, 32, 32, 32, 31, 39, 30, 40, 30, 41, 30, 41, + 0, 54, 0, 64, 0, 65, 0, 63, 46, 35, 52, 36, 53, 37, 55, 38, + 30, 34, 32, 34, 33, 32, 33, 31, 39, 49, 34, 35, 32, 37, 31, 39, + 0, 59, 0, 54, 0, 49, 0, 47, 60, 38, 64, 38, 66, 39, 64, 42, + 33, 31, 33, 32, 33, 32, 32, 33, 31, 40, 30, 40, 29, 41, 29, 42, + 0, 51, 0, 61, 0, 63, 0, 62, 46, 35, 51, 36, 53, 37, 55, 37, + 31, 33, 32, 33, 32, 32, 33, 32, 37, 54, 38, 36, 34, 37, 32, 39, + 0, 58, 0, 53, 0, 49, 0, 46, 59, 37, 63, 38, 64, 40, 62, 42, + 33, 32, 33, 32, 33, 33, 33, 33, 31, 40, 31, 40, 30, 41, 30, 42, + 0, 39, 0, 60, 0, 73, 0, 60, 34, 33, 38, 32, 49, 31, 73, 30, // mode 1, offset 512 + 58, 44, 40, 51, 39, 48, 39, 46, 31, 32, 30, 31, 32, 31, 33, 32, + 0, 43, 0, 35, 0, 33, 0, 31, 87, 35, 78, 54, 47, 86, 17, 114, + 38, 45, 36, 45, 35, 44, 34, 44, 33, 32, 33, 32, 33, 32, 34, 33, + 0, 43, 0, 53, 0, 52, 0, 46, 37, 32, 50, 30, 66, 30, 78, 35, + 53, 70, 42, 72, 39, 70, 37, 68, 30, 31, 31, 30, 32, 30, 34, 30, + 0, 43, 0, 40, 0, 33, 0, 26, 75, 48, 
62, 68, 37, 97, 14, 122, + 37, 66, 35, 65, 33, 62, 32, 59, 34, 30, 35, 30, 37, 31, 38, 33, + 0, 40, 0, 45, 0, 41, 0, 37, 39, 33, 54, 32, 70, 35, 73, 44, + 34, 87, 34, 84, 33, 83, 32, 82, 37, 30, 41, 29, 40, 29, 40, 30, + 0, 37, 0, 35, 0, 28, 0, 19, 65, 60, 48, 82, 27, 108, 11, 127, + 31, 81, 30, 79, 28, 76, 27, 70, 41, 29, 43, 29, 45, 30, 46, 32, + 0, 38, 0, 39, 0, 33, 0, 30, 40, 34, 54, 35, 65, 41, 65, 53, + 27, 73, 30, 73, 29, 75, 27, 76, 62, 28, 62, 28, 59, 28, 58, 29, + 0, 29, 0, 27, 0, 19, 0, 9, 53, 72, 35, 95, 19, 117, 16, 127, + 26, 77, 24, 77, 23, 74, 23, 68, 58, 29, 60, 28, 61, 30, 60, 34, + 0, 35, 0, 33, 0, 28, 0, 24, 40, 35, 51, 39, 57, 49, 52, 65, + 29, 44, 29, 49, 28, 53, 26, 56, 89, 30, 86, 30, 83, 30, 82, 30, + 0, 22, 0, 18, 0, 10, 0, 0, 39, 86, 22, 108, 13, 125, 19, 127, + 24, 58, 23, 59, 22, 58, 22, 56, 82, 30, 82, 31, 80, 33, 74, 40, + 0, 33, 0, 29, 0, 24, 0, 19, 40, 36, 46, 44, 45, 58, 37, 78, + 31, 28, 29, 31, 28, 34, 26, 37, 90, 45, 92, 43, 91, 43, 91, 43, + 0, 15, 0, 11, 0, 2, 0, 0, 22, 99, 11, 118, 11, 127, 17, 127, + 25, 38, 24, 39, 23, 41, 23, 43, 91, 42, 90, 44, 85, 48, 75, 55, + 0, 31, 0, 27, 0, 22, 0, 15, 37, 39, 37, 52, 30, 70, 19, 91, + 30, 28, 28, 30, 27, 32, 26, 33, 54, 82, 58, 79, 58, 79, 58, 79, + 0, 10, 0, 5, 0, 0, 0, 0, 8, 111, 2, 125, 9, 127, 13, 127, + 25, 34, 25, 35, 25, 36, 25, 39, 58, 79, 57, 80, 53, 84, 47, 88, + 0, 28, 0, 24, 0, 19, 0, 13, 29, 46, 24, 62, 14, 81, 4, 101, + 28, 39, 27, 41, 25, 43, 24, 44, 2, 123, 1, 125, 0, 126, 0, 127, + 0, 6, 0, 0, 0, 0, 0, 0, 0, 116, 0, 126, 4, 127, 9, 127, + 23, 45, 23, 45, 25, 44, 25, 44, 0, 127, 1, 127, 2, 127, 3, 127, + 0, 30, 0, 63, 0, 98, 0, 75, 32, 32, 26, 34, 26, 34, 61, 30, // mode 2, offset 1024 + 42, 34, 16, 38, 25, 34, 31, 32, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 36, 0, 26, 0, 30, 0, 32, 94, 32, 76, 58, 39, 91, 23, 105, + 30, 33, 30, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, + 0, 34, 0, 66, 0, 97, 0, 71, 30, 33, 24, 34, 28, 34, 65, 30, + 31, 52, 11, 41, 24, 34, 30, 32, 29, 32, 33, 32, 33, 32, 33, 32, + 0, 34, 0, 26, 0, 30, 0, 32, 92, 35, 70, 64, 37, 94, 23, 105, + 30, 33, 29, 34, 30, 33, 31, 33, 32, 32, 32, 32, 32, 31, 33, 31, + 0, 37, 0, 71, 0, 98, 0, 66, 29, 33, 22, 35, 29, 34, 70, 30, + 8, 79, 5, 50, 23, 34, 31, 31, 27, 32, 32, 32, 34, 32, 33, 32, + 0, 31, 0, 26, 0, 30, 0, 31, 92, 38, 66, 68, 34, 97, 22, 106, + 30, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 31, 33, 31, + 0, 40, 0, 76, 0, 97, 0, 61, 28, 34, 21, 35, 32, 34, 75, 29, + 0, 76, 0, 55, 21, 37, 30, 32, 46, 28, 35, 32, 33, 33, 32, 32, + 0, 29, 0, 26, 0, 29, 0, 31, 92, 40, 62, 73, 32, 99, 22, 107, + 29, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 30, 33, 31, + 0, 42, 0, 80, 0, 94, 0, 55, 27, 34, 20, 35, 36, 32, 80, 29, + 1, 48, 0, 48, 17, 40, 27, 35, 79, 25, 47, 31, 33, 33, 31, 32, + 0, 27, 0, 26, 0, 29, 0, 31, 90, 43, 58, 76, 30, 101, 21, 108, + 28, 34, 29, 33, 29, 34, 29, 35, 32, 31, 33, 30, 34, 30, 34, 30, + 0, 44, 0, 81, 0, 90, 0, 51, 26, 34, 21, 35, 41, 31, 82, 29, + 6, 30, 0, 41, 14, 41, 24, 37, 80, 40, 52, 35, 35, 33, 32, 32, + 0, 27, 0, 26, 0, 29, 0, 31, 87, 47, 54, 79, 29, 102, 21, 108, + 27, 35, 29, 34, 28, 34, 28, 35, 32, 31, 33, 30, 33, 30, 33, 31, + 0, 47, 0, 80, 0, 84, 0, 49, 26, 34, 24, 34, 45, 31, 81, 31, + 7, 34, 0, 41, 12, 40, 22, 37, 44, 75, 41, 50, 36, 36, 33, 32, + 0, 28, 0, 28, 0, 29, 0, 31, 81, 51, 51, 81, 30, 101, 22, 107, + 26, 35, 28, 34, 28, 35, 28, 35, 33, 31, 33, 30, 33, 31, 33, 32, + 0, 48, 0, 75, 0, 77, 0, 49, 27, 34, 27, 34, 47, 33, 75, 36, + 10, 40, 3, 42, 12, 40, 21, 37, 16, 97, 26, 66, 
32, 43, 33, 35, + 0, 32, 0, 30, 0, 30, 0, 31, 72, 55, 49, 81, 32, 98, 24, 104, + 25, 36, 27, 35, 28, 35, 28, 35, 33, 32, 33, 31, 32, 32, 32, 33, + 0, 36, 0, 74, 0, 92, 0, 53, 29, 33, 20, 35, 35, 32, 80, 26, // mode 3, offset 1536 + 43, 47, 19, 47, 29, 31, 33, 28, 29, 31, 34, 32, 40, 34, 36, 37, + 0, 24, 0, 25, 0, 32, 0, 34, 91, 41, 57, 74, 28, 99, 20, 105, + 31, 31, 31, 32, 32, 32, 33, 32, 31, 38, 30, 37, 29, 36, 30, 35, + 0, 50, 0, 75, 0, 64, 0, 31, 26, 34, 28, 33, 58, 29, 85, 37, + 33, 74, 23, 46, 30, 26, 31, 27, 30, 31, 47, 33, 46, 40, 33, 44, + 0, 22, 0, 29, 0, 33, 0, 34, 67, 64, 35, 93, 20, 105, 19, 106, + 30, 31, 31, 32, 32, 33, 33, 32, 28, 42, 27, 40, 27, 37, 29, 36, + 0, 51, 0, 61, 0, 40, 0, 22, 29, 33, 42, 31, 70, 34, 72, 54, + 25, 72, 30, 31, 32, 24, 30, 31, 51, 30, 60, 39, 41, 50, 27, 50, + 0, 25, 0, 32, 0, 34, 0, 34, 44, 83, 23, 102, 18, 107, 19, 105, + 30, 33, 32, 33, 32, 33, 33, 32, 25, 44, 26, 40, 28, 37, 30, 35, + 0, 45, 0, 43, 0, 27, 0, 22, 35, 32, 53, 33, 67, 45, 53, 72, + 30, 39, 35, 24, 32, 29, 30, 33, 79, 33, 53, 55, 27, 61, 22, 52, + 0, 28, 0, 32, 0, 34, 0, 34, 31, 95, 20, 105, 18, 107, 20, 105, + 31, 33, 32, 33, 32, 32, 33, 31, 25, 43, 27, 38, 29, 36, 31, 35, + 0, 38, 0, 31, 0, 22, 0, 25, 40, 32, 55, 39, 57, 60, 39, 86, + 35, 23, 34, 29, 31, 35, 31, 35, 72, 54, 32, 73, 18, 64, 22, 49, + 0, 30, 0, 33, 0, 34, 0, 34, 24, 101, 19, 106, 18, 107, 20, 104, + 32, 33, 32, 32, 33, 31, 33, 31, 27, 40, 30, 36, 31, 35, 32, 34, + 0, 33, 0, 26, 0, 23, 0, 27, 42, 35, 51, 50, 46, 74, 32, 93, + 34, 28, 33, 34, 31, 35, 32, 34, 39, 82, 18, 80, 20, 59, 26, 44, + 0, 31, 0, 33, 0, 34, 0, 35, 22, 103, 19, 106, 19, 106, 21, 103, + 32, 32, 33, 31, 33, 31, 34, 31, 30, 37, 31, 35, 32, 34, 32, 34, + 0, 29, 0, 24, 0, 24, 0, 28, 41, 41, 44, 62, 37, 83, 28, 97, + 33, 34, 34, 35, 34, 33, 33, 32, 20, 92, 18, 73, 25, 52, 30, 40, + 0, 32, 0, 34, 0, 35, 0, 35, 23, 103, 20, 105, 20, 104, 22, 102, + 33, 31, 34, 30, 34, 30, 34, 30, 32, 36, 33, 34, 33, 33, 33, 34, + 0, 27, 0, 26, 0, 27, 0, 30, 38, 51, 37, 71, 33, 87, 28, 96, + 34, 34, 35, 34, 35, 32, 34, 31, 20, 86, 24, 64, 30, 47, 32, 39, + 0, 32, 0, 34, 0, 35, 0, 34, 24, 100, 23, 101, 23, 101, 24, 99, + 35, 30, 34, 30, 34, 30, 35, 30, 32, 36, 33, 34, 32, 34, 33, 34, + 0, 39, 0, 72, 0, 100, 0, 75, 30, 31, 21, 32, 23, 32, 63, 24, // mode 4, offset 2048 + 67, 33, 43, 39, 35, 39, 32, 38, 34, 31, 33, 31, 34, 31, 34, 32, + 0, 32, 0, 22, 0, 31, 0, 35, 98, 26, 77, 55, 37, 90, 22, 100, + 29, 37, 29, 36, 31, 35, 33, 33, 35, 32, 35, 31, 35, 32, 36, 33, + 0, 47, 0, 71, 0, 86, 0, 65, 29, 32, 24, 32, 31, 30, 63, 25, + 74, 54, 60, 50, 46, 48, 34, 46, 32, 31, 36, 30, 37, 30, 39, 30, + 0, 33, 0, 26, 0, 33, 0, 37, 85, 32, 64, 60, 33, 87, 23, 93, + 28, 43, 27, 39, 29, 35, 32, 33, 40, 30, 41, 30, 41, 31, 41, 32, + 0, 41, 0, 55, 0, 62, 0, 53, 32, 32, 31, 32, 37, 31, 55, 31, + 45, 84, 50, 70, 45, 61, 36, 55, 32, 32, 40, 30, 45, 29, 48, 29, + 0, 38, 0, 34, 0, 38, 0, 40, 63, 40, 49, 60, 30, 78, 24, 83, + 29, 48, 27, 43, 28, 38, 30, 36, 50, 28, 51, 29, 50, 31, 48, 33, + 0, 35, 0, 39, 0, 41, 0, 41, 33, 33, 35, 33, 39, 34, 43, 37, + 29, 75, 34, 68, 36, 61, 33, 54, 58, 29, 59, 29, 62, 29, 64, 28, + 0, 41, 0, 42, 0, 42, 0, 42, 43, 45, 36, 56, 30, 65, 28, 68, + 30, 48, 27, 44, 27, 41, 28, 37, 65, 29, 63, 30, 60, 33, 56, 36, + 0, 33, 0, 31, 0, 31, 0, 35, 34, 33, 36, 34, 37, 35, 35, 39, + 31, 42, 31, 44, 32, 43, 32, 40, 88, 30, 84, 31, 83, 31, 82, 31, + 0, 40, 0, 44, 0, 44, 0, 43, 32, 44, 30, 48, 30, 52, 30, 55, + 31, 38, 30, 37, 28, 37, 29, 35, 81, 31, 78, 33, 72, 36, 66, 40, + 0, 32, 
0, 30, 0, 30, 0, 33, 33, 33, 34, 34, 34, 36, 32, 38, + 34, 25, 33, 25, 34, 25, 34, 25, 85, 48, 88, 44, 90, 41, 90, 40, + 0, 38, 0, 42, 0, 43, 0, 42, 29, 41, 29, 41, 30, 42, 31, 45, + 34, 26, 33, 27, 31, 28, 31, 30, 88, 40, 85, 41, 80, 43, 72, 47, + 0, 32, 0, 31, 0, 32, 0, 34, 33, 33, 32, 34, 32, 35, 31, 36, + 33, 26, 35, 20, 36, 17, 36, 17, 54, 79, 68, 68, 76, 62, 79, 59, + 0, 37, 0, 39, 0, 41, 0, 40, 29, 37, 29, 37, 30, 37, 31, 40, + 36, 18, 35, 20, 34, 22, 32, 26, 78, 58, 77, 58, 74, 58, 68, 59, + 0, 33, 0, 34, 0, 34, 0, 35, 31, 34, 30, 34, 31, 34, 31, 34, + 33, 29, 35, 23, 36, 20, 36, 18, 31, 98, 45, 88, 54, 82, 59, 78, + 0, 36, 0, 38, 0, 39, 0, 39, 31, 34, 30, 34, 31, 35, 31, 37, + 37, 19, 36, 20, 35, 22, 34, 24, 60, 76, 61, 74, 60, 73, 59, 71, + 0, 30, 0, 47, 0, 81, 0, 85, 33, 32, 30, 31, 28, 32, 46, 29, // mode 5, offset 2560 + 55, 32, 29, 36, 28, 34, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 0, 54, 0, 30, 0, 30, 0, 37, 82, 26, 90, 38, 56, 73, 21, 102, + 32, 32, 31, 32, 31, 33, 32, 32, 33, 32, 33, 32, 32, 32, 32, 32, + 0, 33, 0, 38, 0, 63, 0, 82, 32, 31, 32, 31, 30, 31, 37, 30, + 68, 39, 43, 34, 29, 34, 29, 33, 31, 31, 33, 31, 32, 32, 32, 32, + 0, 71, 0, 44, 0, 33, 0, 37, 63, 27, 86, 30, 72, 55, 37, 86, + 31, 32, 30, 33, 30, 32, 31, 32, 33, 32, 33, 32, 32, 31, 33, 31, + 0, 34, 0, 36, 0, 51, 0, 75, 33, 32, 33, 31, 30, 31, 31, 31, + 60, 61, 56, 38, 38, 33, 30, 33, 29, 32, 32, 31, 33, 32, 33, 32, + 0, 80, 0, 60, 0, 41, 0, 38, 47, 29, 73, 27, 78, 41, 53, 68, + 30, 32, 30, 33, 30, 33, 30, 32, 33, 31, 33, 31, 32, 31, 33, 31, + 0, 33, 0, 35, 0, 43, 0, 64, 33, 32, 33, 31, 32, 31, 30, 31, + 43, 77, 55, 54, 46, 39, 35, 34, 35, 30, 29, 32, 31, 32, 33, 32, + 0, 79, 0, 73, 0, 54, 0, 43, 37, 30, 57, 28, 73, 33, 64, 52, + 31, 32, 30, 32, 30, 32, 30, 32, 33, 31, 33, 31, 33, 31, 33, 31, + 0, 33, 0, 34, 0, 38, 0, 54, 33, 32, 33, 31, 33, 31, 31, 31, + 34, 68, 45, 70, 48, 52, 40, 39, 58, 28, 33, 31, 29, 32, 31, 32, + 0, 73, 0, 77, 0, 65, 0, 51, 32, 31, 45, 29, 63, 30, 66, 42, + 34, 34, 31, 32, 31, 31, 30, 32, 33, 31, 32, 32, 33, 31, 33, 31, + 0, 33, 0, 34, 0, 36, 0, 47, 32, 32, 33, 31, 33, 30, 31, 31, + 34, 44, 38, 66, 44, 62, 43, 48, 81, 31, 52, 28, 34, 31, 30, 32, + 0, 64, 0, 75, 0, 71, 0, 59, 31, 31, 38, 30, 53, 30, 61, 37, + 38, 38, 33, 34, 31, 32, 30, 32, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 33, 0, 34, 0, 36, 0, 43, 32, 31, 33, 31, 33, 31, 32, 31, + 35, 31, 37, 49, 41, 60, 43, 54, 71, 54, 70, 33, 48, 30, 35, 31, + 0, 56, 0, 68, 0, 70, 0, 63, 31, 31, 35, 30, 45, 30, 55, 35, + 40, 44, 36, 37, 33, 34, 31, 33, 32, 32, 32, 32, 33, 32, 33, 32, + 0, 33, 0, 34, 0, 36, 0, 41, 32, 31, 32, 31, 33, 31, 33, 31, + 33, 34, 36, 38, 39, 50, 41, 53, 36, 87, 62, 52, 57, 36, 43, 33, + 0, 50, 0, 59, 0, 65, 0, 62, 33, 31, 35, 31, 42, 31, 49, 35, + 41, 48, 37, 41, 35, 36, 33, 34, 36, 32, 34, 32, 33, 32, 34, 33, +}; \ No newline at end of file diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 324a2b41..54a67ef6 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -36,7 +36,7 @@ #include "cu.h" #include "intra.h" -#include "mip_data.h" +#include "strategies/generic/mip_data_generic.h" #include "uvg266.h" #include "strategyselector.h" #include "uvg_math.h" diff --git a/src/mip_data.h b/src/strategies/generic/mip_data_generic.h similarity index 55% rename from src/mip_data.h rename to src/strategies/generic/mip_data_generic.h index 79a2a3f9..957badcc 100644 --- a/src/mip_data.h +++ b/src/strategies/generic/mip_data_generic.h @@ 
-39,6 +39,7 @@ /** \file MipData.h \brief weight and bias data for matrix-based intra prediction (MIP) */ +#pragma once #define MIP_SHIFT_MATRIX 6 #define MIP_OFFSET_MATRIX 32 @@ -882,562 +883,3 @@ ALIGNED(32) static const uint8_t uvg_mip_matrix_16x16[6][64][8] = } }; -// MIP weight tables for AVX2. - -// This is the same table as used in generic version, but 16-bit. -static ALIGNED(32) const uint16_t uvg_mip_sid0_weights[16][16][4] = -{ - { - { 32, 30, 90, 28}, - { 32, 32, 72, 28}, - { 34, 77, 53, 30}, - { 51, 124, 36, 37}, - { 31, 31, 95, 37}, - { 33, 31, 70, 50}, - { 52, 80, 25, 60}, - { 78, 107, 1, 65}, - { 31, 29, 37, 95}, - { 38, 34, 19, 101}, - { 73, 85, 0, 81}, - { 92, 99, 0, 65}, - { 34, 29, 14, 111}, - { 48, 48, 7, 100}, - { 80, 91, 0, 74}, - { 89, 97, 0, 64} - }, - { - { 31, 23, 34, 29}, - { 31, 43, 34, 31}, - { 30, 95, 34, 32}, - { 29, 100, 35, 33}, - { 31, 23, 34, 29}, - { 31, 43, 34, 31}, - { 30, 95, 34, 32}, - { 29, 99, 35, 33}, - { 31, 24, 35, 29}, - { 31, 44, 34, 31}, - { 30, 95, 35, 32}, - { 29, 99, 35, 33}, - { 31, 24, 35, 30}, - { 31, 44, 35, 31}, - { 30, 95, 35, 32}, - { 29, 99, 35, 33} - }, - { - { 32, 32, 36, 58}, - { 32, 29, 26, 66}, - { 36, 37, 23, 61}, - { 79, 84, 3, 37}, - { 32, 32, 30, 69}, - { 33, 29, 24, 71}, - { 44, 16, 21, 70}, - { 96, 18, 0, 57}, - { 32, 31, 24, 74}, - { 33, 30, 23, 71}, - { 36, 24, 24, 71}, - { 59, 9, 16, 68}, - { 32, 32, 23, 75}, - { 33, 30, 24, 70}, - { 32, 30, 25, 71}, - { 36, 26, 25, 70} - }, - { - { 32, 33, 34, 32}, - { 32, 30, 22, 38}, - { 29, 46, 25, 38}, - { 53, 123, 28, 22}, - { 32, 33, 30, 37}, - { 32, 30, 21, 38}, - { 32, 40, 24, 38}, - { 64, 116, 26, 17}, - { 32, 32, 23, 49}, - { 32, 30, 21, 39}, - { 34, 39, 24, 37}, - { 72, 109, 23, 16}, - { 33, 31, 17, 60}, - { 32, 31, 21, 39}, - { 35, 41, 24, 37}, - { 72, 106, 22, 18} - }, - { - { 34, 25, 89, 20}, - { 38, 32, 47, 24}, - { 40, 86, 29, 27}, - { 38, 98, 32, 29}, - { 34, 31, 94, 40}, - { 44, 25, 83, 27}, - { 54, 72, 43, 16}, - { 47, 94, 33, 22}, - { 33, 31, 36, 94}, - { 43, 23, 51, 76}, - { 62, 55, 64, 25}, - { 57, 89, 38, 15}, - { 32, 32, 28, 101}, - { 38, 26, 33, 94}, - { 55, 38, 68, 47}, - { 59, 80, 52, 16} - }, - { - { 28, 30, 68, 29}, - { 23, 48, 23, 48}, - { 39, 98, 16, 42}, - { 84, 86, 20, 17}, - { 25, 31, 52, 74}, - { 38, 68, 5, 70}, - { 95, 78, 7, 21}, - { 127, 54, 12, 0}, - { 30, 47, 14, 107}, - { 79, 76, 0, 53}, - { 127, 59, 7, 1}, - { 127, 51, 9, 0}, - { 50, 71, 1, 96}, - { 109, 69, 7, 25}, - { 127, 56, 9, 0}, - { 123, 53, 13, 0} - }, - { - { 40, 20, 72, 18}, - { 48, 29, 44, 18}, - { 53, 81, 35, 18}, - { 48, 96, 33, 22}, - { 45, 23, 79, 49}, - { 61, 21, 56, 49}, - { 72, 52, 32, 48}, - { 65, 69, 20, 50}, - { 41, 27, 29, 96}, - { 49, 22, 28, 94}, - { 52, 22, 28, 93}, - { 49, 27, 27, 92}, - { 37, 29, 26, 98}, - { 39, 28, 28, 97}, - { 38, 28, 30, 97}, - { 38, 29, 30, 95} - }, - { - { 33, 27, 43, 27}, - { 32, 29, 31, 31}, - { 31, 73, 33, 31}, - { 35, 104, 34, 28}, - { 32, 30, 63, 22}, - { 33, 26, 33, 29}, - { 33, 57, 33, 30}, - { 37, 100, 35, 27}, - { 32, 31, 85, 25}, - { 34, 25, 39, 25}, - { 35, 39, 32, 28}, - { 40, 91, 35, 25}, - { 32, 30, 77, 50}, - { 34, 26, 54, 22}, - { 37, 31, 34, 27}, - { 45, 75, 34, 23} - }, - { - { 34, 25, 77, 19}, - { 36, 34, 56, 24}, - { 41, 83, 39, 30}, - { 47, 96, 28, 35}, - { 34, 31, 70, 65}, - { 38, 29, 53, 77}, - { 43, 36, 37, 83}, - { 48, 39, 28, 83}, - { 33, 31, 31, 98}, - { 33, 31, 30, 99}, - { 34, 30, 31, 98}, - { 36, 29, 31, 96}, - { 32, 32, 30, 97}, - { 32, 32, 31, 96}, - { 31, 33, 33, 96}, - { 32, 33, 34, 94} - }, - { - { 30, 30, 
93, 19}, - { 31, 59, 67, 34}, - { 31, 79, 36, 59}, - { 30, 67, 17, 79}, - { 30, 38, 68, 69}, - { 29, 40, 43, 91}, - { 26, 35, 32, 101}, - { 23, 32, 30, 101}, - { 26, 34, 30, 101}, - { 23, 33, 30, 102}, - { 20, 32, 31, 102}, - { 18, 33, 32, 102}, - { 23, 33, 31, 100}, - { 20, 34, 32, 100}, - { 18, 35, 33, 100}, - { 18, 35, 33, 100} - }, - { - { 31, 54, 90, 26}, - { 32, 60, 53, 61}, - { 34, 49, 37, 84}, - { 34, 39, 35, 89}, - { 35, 38, 41, 88}, - { 35, 35, 32, 96}, - { 35, 31, 33, 96}, - { 35, 32, 35, 94}, - { 34, 34, 30, 97}, - { 35, 32, 33, 95}, - { 35, 32, 34, 94}, - { 35, 34, 34, 93}, - { 34, 34, 34, 93}, - { 35, 34, 34, 93}, - { 35, 34, 34, 92}, - { 36, 34, 35, 91} - }, - { - { 32, 29, 54, 24}, - { 31, 32, 34, 29}, - { 31, 43, 34, 29}, - { 32, 67, 36, 28}, - { 31, 34, 69, 37}, - { 31, 35, 46, 33}, - { 30, 35, 39, 33}, - { 30, 42, 39, 36}, - { 31, 35, 39, 88}, - { 30, 38, 41, 84}, - { 30, 39, 40, 81}, - { 39, 46, 38, 78}, - { 31, 36, 34, 96}, - { 34, 38, 37, 93}, - { 55, 42, 38, 82}, - { 89, 53, 38, 65} - }, - { - { 32, 33, 43, 29}, - { 32, 30, 29, 33}, - { 31, 47, 31, 33}, - { 33, 100, 31, 31}, - { 32, 33, 74, 25}, - { 32, 32, 34, 31}, - { 32, 33, 30, 33}, - { 32, 68, 30, 32}, - { 32, 31, 91, 40}, - { 32, 32, 58, 26}, - { 31, 31, 30, 32}, - { 31, 42, 30, 33}, - { 32, 31, 49, 85}, - { 32, 31, 83, 35}, - { 31, 33, 48, 29}, - { 31, 36, 32, 33} - }, - { - { 31, 29, 81, 35}, - { 32, 28, 34, 50}, - { 31, 75, 16, 43}, - { 34, 103, 29, 32}, - { 32, 32, 53, 78}, - { 31, 28, 36, 88}, - { 30, 52, 18, 73}, - { 52, 88, 17, 35}, - { 32, 32, 35, 94}, - { 30, 31, 35, 95}, - { 36, 29, 31, 92}, - { 100, 43, 16, 40}, - { 32, 32, 35, 93}, - { 30, 32, 38, 93}, - { 55, 18, 37, 83}, - { 127, 0, 30, 40} - }, - { - { 31, 22, 47, 30}, - { 31, 48, 25, 34}, - { 30, 95, 31, 32}, - { 32, 103, 33, 32}, - { 30, 24, 57, 31}, - { 30, 47, 26, 34}, - { 31, 95, 31, 32}, - { 43, 97, 35, 25}, - { 29, 26, 44, 63}, - { 37, 38, 24, 47}, - { 74, 63, 28, 20}, - { 110, 58, 34, 3}, - { 46, 22, 5, 108}, - { 93, 5, 9, 77}, - { 127, 0, 17, 52}, - { 127, 0, 15, 50} - }, - { - { 32, 27, 68, 24}, - { 35, 23, 35, 28}, - { 35, 64, 29, 29}, - { 37, 104, 33, 28}, - { 32, 32, 91, 40}, - { 36, 23, 67, 36}, - { 49, 23, 39, 28}, - { 60, 67, 30, 20}, - { 32, 32, 36, 95}, - { 35, 29, 38, 93}, - { 50, 16, 30, 84}, - { 72, 16, 15, 65}, - { 32, 32, 27, 100}, - { 33, 32, 29, 100}, - { 37, 29, 30, 98}, - { 48, 21, 29, 90} - } -}; - -// Weight vectors for MIP size_id 1. 
-static ALIGNED(32) const uint16_t uvg_mip_sid1_weights[] = { - 30, 63, 30, 60, 29, 45, 30, 39, 46, 37, 66, 38, 74, 42, 62, 58, // mode 0, offset 0 - 25, 33, 32, 31, 32, 32, 32, 33, 33, 34, 32, 33, 32, 33, 32, 33, - 30, 66, 29, 54, 28, 48, 28, 41, 55, 39, 69, 40, 71, 43, 72, 46, - 32, 30, 33, 31, 32, 33, 32, 34, 30, 36, 31, 33, 32, 33, 32, 33, - 30, 66, 29, 55, 27, 46, 27, 42, 56, 40, 69, 39, 72, 43, 69, 48, - 32, 33, 33, 33, 33, 33, 32, 34, 28, 33, 30, 32, 32, 33, 32, 33, - 30, 63, 29, 56, 27, 47, 27, 42, 55, 40, 66, 40, 69, 44, 65, 50, - 32, 33, 33, 33, 33, 33, 32, 34, 35, 30, 33, 30, 33, 32, 32, 33, - 32, 33, 33, 56, 33, 77, 33, 37, 30, 31, 28, 30, 52, 26, 80, 41, // mode 1, offset 128 - 74, 30, 41, 29, 29, 34, 31, 34, 31, 32, 32, 32, 30, 32, 30, 32, - 32, 32, 33, 31, 33, 47, 33, 61, 33, 31, 31, 30, 28, 29, 44, 28, - 59, 76, 78, 40, 53, 27, 34, 32, 28, 31, 28, 32, 31, 31, 31, 31, - 32, 31, 32, 31, 33, 27, 33, 33, 34, 30, 34, 29, 34, 29, 34, 30, - 26, 64, 45, 86, 73, 55, 62, 33, 76, 27, 36, 29, 25, 32, 30, 31, - 32, 31, 32, 31, 32, 30, 33, 28, 34, 30, 35, 29, 36, 29, 37, 30, - 30, 29, 27, 53, 40, 80, 58, 60, 58, 74, 77, 35, 44, 31, 31, 33, - 32, 51, 32, 95, 32, 27, 32, 34, 27, 32, 42, 29, 99, 34, 21, 104, // mode 2, offset 256 - 27, 50, 29, 42, 31, 41, 31, 42, 29, 32, 30, 32, 29, 32, 30, 32, - 32, 45, 32, 77, 32, 38, 32, 30, 30, 32, 38, 30, 78, 33, 30, 87, - 9, 88, 9, 76, 14, 67, 20, 59, 40, 30, 38, 30, 37, 30, 38, 31, - 33, 37, 34, 44, 36, 39, 37, 31, 32, 32, 34, 31, 45, 31, 31, 54, - 27, 18, 25, 17, 24, 15, 25, 14, 106, 34, 108, 31, 108, 30, 101, 32, - 36, 33, 39, 32, 44, 33, 47, 30, 32, 30, 32, 29, 31, 27, 31, 32, - 29, 37, 27, 37, 25, 37, 25, 34, 13, 110, 15, 108, 16, 106, 19, 102, - 32, 48, 32, 33, 32, 29, 33, 33, 35, 35, 59, 40, 47, 65, 31, 81, // mode 3, offset 384 - 47, 68, 27, 71, 24, 62, 26, 50, 31, 31, 33, 30, 37, 30, 42, 32, - 32, 30, 32, 20, 33, 30, 36, 34, 40, 38, 46, 50, 29, 66, 27, 69, - 30, 70, 26, 55, 25, 41, 26, 31, 55, 31, 64, 31, 72, 33, 67, 39, - 33, 28, 36, 27, 43, 30, 51, 27, 36, 40, 33, 50, 26, 57, 28, 55, - 30, 26, 31, 20, 28, 17, 22, 23, 85, 47, 79, 53, 67, 62, 49, 70, - 38, 29, 51, 31, 69, 23, 77, 13, 32, 39, 28, 43, 30, 40, 35, 38, - 28, 30, 24, 31, 15, 38, 8, 43, 22, 104, 17, 102, 10, 95, 8, 90, - 32, 38, 32, 40, 32, 37, 33, 34, 32, 33, 37, 32, 46, 35, 30, 62, // mode 4, offset 512 -101, 40, 100, 36, 94, 33, 81, 35, 29, 32, 30, 32, 30, 31, 30, 31, - 32, 32, 32, 31, 33, 33, 33, 32, 33, 32, 33, 33, 33, 33, 34, 36, - 22, 102, 26, 104, 31, 103, 37, 94, 39, 29, 34, 28, 32, 28, 33, 28, - 32, 33, 32, 34, 33, 33, 33, 33, 32, 32, 33, 33, 34, 33, 33, 36, - 34, 24, 33, 30, 31, 37, 30, 46, 99, 36, 98, 32, 95, 29, 85, 31, - 32, 33, 32, 34, 32, 33, 33, 33, 32, 33, 33, 33, 34, 34, 32, 37, - 30, 34, 31, 32, 31, 29, 32, 30, 23, 104, 30, 98, 39, 91, 47, 82, - 32, 52, 33, 19, 33, 30, 34, 35, 48, 31, 62, 50, 20, 74, 23, 56, // mode 5, offset 640 - 38, 76, 25, 50, 29, 29, 31, 25, 26, 32, 51, 31, 54, 51, 41, 76, - 33, 25, 35, 28, 37, 35, 38, 32, 38, 39, 25, 47, 22, 38, 33, 29, - 28, 39, 31, 23, 31, 27, 30, 31, 83, 35, 57, 74, 30, 101, 27, 103, - 34, 32, 38, 33, 40, 32, 40, 32, 27, 37, 28, 32, 33, 27, 34, 27, - 32, 25, 30, 31, 29, 33, 28, 33, 41, 92, 18, 111, 18, 111, 23, 105, - 35, 32, 38, 31, 40, 32, 40, 32, 30, 33, 33, 30, 33, 29, 33, 30, - 31, 33, 29, 33, 29, 34, 29, 34, 20, 107, 21, 106, 22, 105, 24, 101, - 32, 28, 33, 30, 33, 60, 33, 63, 31, 33, 28, 33, 26, 33, 44, 36, // mode 6, offset 768 - 92, 33, 71, 26, 47, 28, 37, 31, 30, 31, 32, 30, 33, 30, 33, 30, - 33, 30, 33, 28, 
33, 30, 33, 38, 31, 33, 29, 34, 26, 33, 29, 32, - 43, 90, 71, 71, 86, 45, 74, 32, 33, 29, 26, 30, 28, 30, 33, 29, - 33, 32, 34, 31, 34, 31, 33, 32, 30, 32, 29, 33, 29, 33, 28, 34, - 29, 41, 26, 71, 37, 88, 55, 75, 95, 27, 73, 22, 46, 25, 36, 28, - 34, 31, 35, 32, 34, 33, 34, 34, 30, 32, 28, 33, 28, 33, 28, 34, - 33, 27, 33, 23, 30, 35, 33, 53, 43, 89, 77, 59, 91, 37, 74, 31, - 33, 49, 33, 71, 32, 23, 31, 33, 26, 32, 72, 24, 70, 68, 21, 106, // mode 7, offset 896 - 26, 52, 30, 32, 32, 32, 33, 32, 28, 31, 34, 31, 32, 32, 32, 33, - 34, 47, 34, 44, 32, 27, 30, 33, 32, 29, 89, 28, 46, 89, 20, 107, - 5, 86, 28, 37, 33, 31, 33, 33, 44, 26, 33, 30, 31, 32, 32, 33, - 35, 39, 34, 27, 31, 31, 29, 32, 42, 27, 87, 43, 32, 100, 22, 106, - 26, 24, 30, 34, 32, 33, 33, 33, 92, 35, 38, 31, 30, 32, 32, 33, - 35, 29, 34, 24, 31, 33, 29, 33, 47, 32, 69, 60, 31, 99, 25, 103, - 32, 32, 34, 33, 32, 33, 33, 33, 17, 100, 28, 44, 32, 31, 32, 35, -}; - - -// Weight vectors for MIP size_id 2. -static ALIGNED(32) const uint16_t uvg_mip_sid2_weights[] = { - 0, 42, 0, 71, 0, 77, 0, 64, 37, 33, 39, 34, 46, 35, 60, 35, // mode 0, offset 0 - 27, 44, 24, 36, 33, 30, 33, 31, 33, 35, 35, 36, 34, 36, 32, 36, - 0, 49, 0, 42, 0, 40, 0, 38, 71, 38, 66, 50, 52, 67, 43, 75, - 32, 32, 33, 31, 33, 31, 33, 32, 31, 36, 32, 36, 32, 35, 32, 35, - 0, 56, 0, 70, 0, 65, 0, 59, 40, 33, 49, 34, 57, 36, 60, 39, - 26, 43, 30, 28, 34, 28, 33, 30, 38, 36, 38, 38, 33, 39, 31, 38, - 0, 55, 0, 51, 0, 46, 0, 42, 60, 43, 61, 47, 62, 51, 60, 55, - 33, 30, 33, 30, 34, 30, 33, 31, 31, 38, 32, 37, 32, 37, 32, 37, - 0, 60, 0, 68, 0, 62, 0, 58, 42, 34, 52, 35, 58, 37, 59, 41, - 30, 37, 35, 22, 34, 28, 33, 30, 43, 38, 37, 40, 31, 40, 30, 39, - 0, 56, 0, 53, 0, 49, 0, 45, 59, 44, 60, 45, 65, 45, 64, 47, - 34, 30, 33, 30, 33, 30, 33, 31, 31, 38, 31, 38, 31, 38, 32, 38, - 0, 59, 0, 66, 0, 61, 0, 59, 44, 35, 53, 36, 58, 38, 57, 41, - 31, 34, 35, 25, 34, 29, 33, 30, 43, 41, 31, 43, 30, 40, 31, 39, - 0, 57, 0, 54, 0, 51, 0, 48, 58, 43, 61, 43, 64, 43, 64, 45, - 33, 30, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, - 0, 57, 0, 65, 0, 63, 0, 61, 45, 35, 54, 37, 56, 38, 56, 41, - 30, 35, 33, 33, 34, 30, 34, 30, 40, 44, 24, 44, 29, 39, 32, 39, - 0, 58, 0, 54, 0, 51, 0, 48, 58, 42, 62, 41, 65, 42, 63, 43, - 33, 31, 33, 31, 33, 31, 33, 32, 31, 39, 31, 39, 31, 39, 31, 39, - 0, 55, 0, 65, 0, 65, 0, 63, 46, 35, 53, 37, 54, 38, 55, 39, - 30, 36, 32, 36, 33, 31, 33, 30, 38, 47, 26, 40, 30, 38, 32, 38, - 0, 59, 0, 54, 0, 49, 0, 48, 58, 40, 64, 40, 66, 40, 64, 42, - 33, 31, 33, 31, 32, 32, 32, 32, 31, 39, 30, 40, 30, 41, 30, 41, - 0, 54, 0, 64, 0, 65, 0, 63, 46, 35, 52, 36, 53, 37, 55, 38, - 30, 34, 32, 34, 33, 32, 33, 31, 39, 49, 34, 35, 32, 37, 31, 39, - 0, 59, 0, 54, 0, 49, 0, 47, 60, 38, 64, 38, 66, 39, 64, 42, - 33, 31, 33, 32, 33, 32, 32, 33, 31, 40, 30, 40, 29, 41, 29, 42, - 0, 51, 0, 61, 0, 63, 0, 62, 46, 35, 51, 36, 53, 37, 55, 37, - 31, 33, 32, 33, 32, 32, 33, 32, 37, 54, 38, 36, 34, 37, 32, 39, - 0, 58, 0, 53, 0, 49, 0, 46, 59, 37, 63, 38, 64, 40, 62, 42, - 33, 32, 33, 32, 33, 33, 33, 33, 31, 40, 31, 40, 30, 41, 30, 42, - 0, 39, 0, 60, 0, 73, 0, 60, 34, 33, 38, 32, 49, 31, 73, 30, // mode 1, offset 512 - 58, 44, 40, 51, 39, 48, 39, 46, 31, 32, 30, 31, 32, 31, 33, 32, - 0, 43, 0, 35, 0, 33, 0, 31, 87, 35, 78, 54, 47, 86, 17, 114, - 38, 45, 36, 45, 35, 44, 34, 44, 33, 32, 33, 32, 33, 32, 34, 33, - 0, 43, 0, 53, 0, 52, 0, 46, 37, 32, 50, 30, 66, 30, 78, 35, - 53, 70, 42, 72, 39, 70, 37, 68, 30, 31, 31, 30, 32, 30, 34, 30, - 0, 43, 0, 40, 0, 33, 0, 26, 75, 48, 
62, 68, 37, 97, 14, 122, - 37, 66, 35, 65, 33, 62, 32, 59, 34, 30, 35, 30, 37, 31, 38, 33, - 0, 40, 0, 45, 0, 41, 0, 37, 39, 33, 54, 32, 70, 35, 73, 44, - 34, 87, 34, 84, 33, 83, 32, 82, 37, 30, 41, 29, 40, 29, 40, 30, - 0, 37, 0, 35, 0, 28, 0, 19, 65, 60, 48, 82, 27, 108, 11, 127, - 31, 81, 30, 79, 28, 76, 27, 70, 41, 29, 43, 29, 45, 30, 46, 32, - 0, 38, 0, 39, 0, 33, 0, 30, 40, 34, 54, 35, 65, 41, 65, 53, - 27, 73, 30, 73, 29, 75, 27, 76, 62, 28, 62, 28, 59, 28, 58, 29, - 0, 29, 0, 27, 0, 19, 0, 9, 53, 72, 35, 95, 19, 117, 16, 127, - 26, 77, 24, 77, 23, 74, 23, 68, 58, 29, 60, 28, 61, 30, 60, 34, - 0, 35, 0, 33, 0, 28, 0, 24, 40, 35, 51, 39, 57, 49, 52, 65, - 29, 44, 29, 49, 28, 53, 26, 56, 89, 30, 86, 30, 83, 30, 82, 30, - 0, 22, 0, 18, 0, 10, 0, 0, 39, 86, 22, 108, 13, 125, 19, 127, - 24, 58, 23, 59, 22, 58, 22, 56, 82, 30, 82, 31, 80, 33, 74, 40, - 0, 33, 0, 29, 0, 24, 0, 19, 40, 36, 46, 44, 45, 58, 37, 78, - 31, 28, 29, 31, 28, 34, 26, 37, 90, 45, 92, 43, 91, 43, 91, 43, - 0, 15, 0, 11, 0, 2, 0, 0, 22, 99, 11, 118, 11, 127, 17, 127, - 25, 38, 24, 39, 23, 41, 23, 43, 91, 42, 90, 44, 85, 48, 75, 55, - 0, 31, 0, 27, 0, 22, 0, 15, 37, 39, 37, 52, 30, 70, 19, 91, - 30, 28, 28, 30, 27, 32, 26, 33, 54, 82, 58, 79, 58, 79, 58, 79, - 0, 10, 0, 5, 0, 0, 0, 0, 8, 111, 2, 125, 9, 127, 13, 127, - 25, 34, 25, 35, 25, 36, 25, 39, 58, 79, 57, 80, 53, 84, 47, 88, - 0, 28, 0, 24, 0, 19, 0, 13, 29, 46, 24, 62, 14, 81, 4, 101, - 28, 39, 27, 41, 25, 43, 24, 44, 2, 123, 1, 125, 0, 126, 0, 127, - 0, 6, 0, 0, 0, 0, 0, 0, 0, 116, 0, 126, 4, 127, 9, 127, - 23, 45, 23, 45, 25, 44, 25, 44, 0, 127, 1, 127, 2, 127, 3, 127, - 0, 30, 0, 63, 0, 98, 0, 75, 32, 32, 26, 34, 26, 34, 61, 30, // mode 2, offset 1024 - 42, 34, 16, 38, 25, 34, 31, 32, 32, 32, 32, 32, 33, 32, 33, 32, - 0, 36, 0, 26, 0, 30, 0, 32, 94, 32, 76, 58, 39, 91, 23, 105, - 30, 33, 30, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, - 0, 34, 0, 66, 0, 97, 0, 71, 30, 33, 24, 34, 28, 34, 65, 30, - 31, 52, 11, 41, 24, 34, 30, 32, 29, 32, 33, 32, 33, 32, 33, 32, - 0, 34, 0, 26, 0, 30, 0, 32, 92, 35, 70, 64, 37, 94, 23, 105, - 30, 33, 29, 34, 30, 33, 31, 33, 32, 32, 32, 32, 32, 31, 33, 31, - 0, 37, 0, 71, 0, 98, 0, 66, 29, 33, 22, 35, 29, 34, 70, 30, - 8, 79, 5, 50, 23, 34, 31, 31, 27, 32, 32, 32, 34, 32, 33, 32, - 0, 31, 0, 26, 0, 30, 0, 31, 92, 38, 66, 68, 34, 97, 22, 106, - 30, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 31, 33, 31, - 0, 40, 0, 76, 0, 97, 0, 61, 28, 34, 21, 35, 32, 34, 75, 29, - 0, 76, 0, 55, 21, 37, 30, 32, 46, 28, 35, 32, 33, 33, 32, 32, - 0, 29, 0, 26, 0, 29, 0, 31, 92, 40, 62, 73, 32, 99, 22, 107, - 29, 33, 29, 34, 30, 34, 30, 34, 32, 32, 32, 31, 33, 30, 33, 31, - 0, 42, 0, 80, 0, 94, 0, 55, 27, 34, 20, 35, 36, 32, 80, 29, - 1, 48, 0, 48, 17, 40, 27, 35, 79, 25, 47, 31, 33, 33, 31, 32, - 0, 27, 0, 26, 0, 29, 0, 31, 90, 43, 58, 76, 30, 101, 21, 108, - 28, 34, 29, 33, 29, 34, 29, 35, 32, 31, 33, 30, 34, 30, 34, 30, - 0, 44, 0, 81, 0, 90, 0, 51, 26, 34, 21, 35, 41, 31, 82, 29, - 6, 30, 0, 41, 14, 41, 24, 37, 80, 40, 52, 35, 35, 33, 32, 32, - 0, 27, 0, 26, 0, 29, 0, 31, 87, 47, 54, 79, 29, 102, 21, 108, - 27, 35, 29, 34, 28, 34, 28, 35, 32, 31, 33, 30, 33, 30, 33, 31, - 0, 47, 0, 80, 0, 84, 0, 49, 26, 34, 24, 34, 45, 31, 81, 31, - 7, 34, 0, 41, 12, 40, 22, 37, 44, 75, 41, 50, 36, 36, 33, 32, - 0, 28, 0, 28, 0, 29, 0, 31, 81, 51, 51, 81, 30, 101, 22, 107, - 26, 35, 28, 34, 28, 35, 28, 35, 33, 31, 33, 30, 33, 31, 33, 32, - 0, 48, 0, 75, 0, 77, 0, 49, 27, 34, 27, 34, 47, 33, 75, 36, - 10, 40, 3, 42, 12, 40, 21, 37, 16, 97, 26, 66, 
32, 43, 33, 35, - 0, 32, 0, 30, 0, 30, 0, 31, 72, 55, 49, 81, 32, 98, 24, 104, - 25, 36, 27, 35, 28, 35, 28, 35, 33, 32, 33, 31, 32, 32, 32, 33, - 0, 36, 0, 74, 0, 92, 0, 53, 29, 33, 20, 35, 35, 32, 80, 26, // mode 3, offset 1536 - 43, 47, 19, 47, 29, 31, 33, 28, 29, 31, 34, 32, 40, 34, 36, 37, - 0, 24, 0, 25, 0, 32, 0, 34, 91, 41, 57, 74, 28, 99, 20, 105, - 31, 31, 31, 32, 32, 32, 33, 32, 31, 38, 30, 37, 29, 36, 30, 35, - 0, 50, 0, 75, 0, 64, 0, 31, 26, 34, 28, 33, 58, 29, 85, 37, - 33, 74, 23, 46, 30, 26, 31, 27, 30, 31, 47, 33, 46, 40, 33, 44, - 0, 22, 0, 29, 0, 33, 0, 34, 67, 64, 35, 93, 20, 105, 19, 106, - 30, 31, 31, 32, 32, 33, 33, 32, 28, 42, 27, 40, 27, 37, 29, 36, - 0, 51, 0, 61, 0, 40, 0, 22, 29, 33, 42, 31, 70, 34, 72, 54, - 25, 72, 30, 31, 32, 24, 30, 31, 51, 30, 60, 39, 41, 50, 27, 50, - 0, 25, 0, 32, 0, 34, 0, 34, 44, 83, 23, 102, 18, 107, 19, 105, - 30, 33, 32, 33, 32, 33, 33, 32, 25, 44, 26, 40, 28, 37, 30, 35, - 0, 45, 0, 43, 0, 27, 0, 22, 35, 32, 53, 33, 67, 45, 53, 72, - 30, 39, 35, 24, 32, 29, 30, 33, 79, 33, 53, 55, 27, 61, 22, 52, - 0, 28, 0, 32, 0, 34, 0, 34, 31, 95, 20, 105, 18, 107, 20, 105, - 31, 33, 32, 33, 32, 32, 33, 31, 25, 43, 27, 38, 29, 36, 31, 35, - 0, 38, 0, 31, 0, 22, 0, 25, 40, 32, 55, 39, 57, 60, 39, 86, - 35, 23, 34, 29, 31, 35, 31, 35, 72, 54, 32, 73, 18, 64, 22, 49, - 0, 30, 0, 33, 0, 34, 0, 34, 24, 101, 19, 106, 18, 107, 20, 104, - 32, 33, 32, 32, 33, 31, 33, 31, 27, 40, 30, 36, 31, 35, 32, 34, - 0, 33, 0, 26, 0, 23, 0, 27, 42, 35, 51, 50, 46, 74, 32, 93, - 34, 28, 33, 34, 31, 35, 32, 34, 39, 82, 18, 80, 20, 59, 26, 44, - 0, 31, 0, 33, 0, 34, 0, 35, 22, 103, 19, 106, 19, 106, 21, 103, - 32, 32, 33, 31, 33, 31, 34, 31, 30, 37, 31, 35, 32, 34, 32, 34, - 0, 29, 0, 24, 0, 24, 0, 28, 41, 41, 44, 62, 37, 83, 28, 97, - 33, 34, 34, 35, 34, 33, 33, 32, 20, 92, 18, 73, 25, 52, 30, 40, - 0, 32, 0, 34, 0, 35, 0, 35, 23, 103, 20, 105, 20, 104, 22, 102, - 33, 31, 34, 30, 34, 30, 34, 30, 32, 36, 33, 34, 33, 33, 33, 34, - 0, 27, 0, 26, 0, 27, 0, 30, 38, 51, 37, 71, 33, 87, 28, 96, - 34, 34, 35, 34, 35, 32, 34, 31, 20, 86, 24, 64, 30, 47, 32, 39, - 0, 32, 0, 34, 0, 35, 0, 34, 24, 100, 23, 101, 23, 101, 24, 99, - 35, 30, 34, 30, 34, 30, 35, 30, 32, 36, 33, 34, 32, 34, 33, 34, - 0, 39, 0, 72, 0, 100, 0, 75, 30, 31, 21, 32, 23, 32, 63, 24, // mode 4, offset 2048 - 67, 33, 43, 39, 35, 39, 32, 38, 34, 31, 33, 31, 34, 31, 34, 32, - 0, 32, 0, 22, 0, 31, 0, 35, 98, 26, 77, 55, 37, 90, 22, 100, - 29, 37, 29, 36, 31, 35, 33, 33, 35, 32, 35, 31, 35, 32, 36, 33, - 0, 47, 0, 71, 0, 86, 0, 65, 29, 32, 24, 32, 31, 30, 63, 25, - 74, 54, 60, 50, 46, 48, 34, 46, 32, 31, 36, 30, 37, 30, 39, 30, - 0, 33, 0, 26, 0, 33, 0, 37, 85, 32, 64, 60, 33, 87, 23, 93, - 28, 43, 27, 39, 29, 35, 32, 33, 40, 30, 41, 30, 41, 31, 41, 32, - 0, 41, 0, 55, 0, 62, 0, 53, 32, 32, 31, 32, 37, 31, 55, 31, - 45, 84, 50, 70, 45, 61, 36, 55, 32, 32, 40, 30, 45, 29, 48, 29, - 0, 38, 0, 34, 0, 38, 0, 40, 63, 40, 49, 60, 30, 78, 24, 83, - 29, 48, 27, 43, 28, 38, 30, 36, 50, 28, 51, 29, 50, 31, 48, 33, - 0, 35, 0, 39, 0, 41, 0, 41, 33, 33, 35, 33, 39, 34, 43, 37, - 29, 75, 34, 68, 36, 61, 33, 54, 58, 29, 59, 29, 62, 29, 64, 28, - 0, 41, 0, 42, 0, 42, 0, 42, 43, 45, 36, 56, 30, 65, 28, 68, - 30, 48, 27, 44, 27, 41, 28, 37, 65, 29, 63, 30, 60, 33, 56, 36, - 0, 33, 0, 31, 0, 31, 0, 35, 34, 33, 36, 34, 37, 35, 35, 39, - 31, 42, 31, 44, 32, 43, 32, 40, 88, 30, 84, 31, 83, 31, 82, 31, - 0, 40, 0, 44, 0, 44, 0, 43, 32, 44, 30, 48, 30, 52, 30, 55, - 31, 38, 30, 37, 28, 37, 29, 35, 81, 31, 78, 33, 72, 36, 66, 40, - 0, 32, 
0, 30, 0, 30, 0, 33, 33, 33, 34, 34, 34, 36, 32, 38, - 34, 25, 33, 25, 34, 25, 34, 25, 85, 48, 88, 44, 90, 41, 90, 40, - 0, 38, 0, 42, 0, 43, 0, 42, 29, 41, 29, 41, 30, 42, 31, 45, - 34, 26, 33, 27, 31, 28, 31, 30, 88, 40, 85, 41, 80, 43, 72, 47, - 0, 32, 0, 31, 0, 32, 0, 34, 33, 33, 32, 34, 32, 35, 31, 36, - 33, 26, 35, 20, 36, 17, 36, 17, 54, 79, 68, 68, 76, 62, 79, 59, - 0, 37, 0, 39, 0, 41, 0, 40, 29, 37, 29, 37, 30, 37, 31, 40, - 36, 18, 35, 20, 34, 22, 32, 26, 78, 58, 77, 58, 74, 58, 68, 59, - 0, 33, 0, 34, 0, 34, 0, 35, 31, 34, 30, 34, 31, 34, 31, 34, - 33, 29, 35, 23, 36, 20, 36, 18, 31, 98, 45, 88, 54, 82, 59, 78, - 0, 36, 0, 38, 0, 39, 0, 39, 31, 34, 30, 34, 31, 35, 31, 37, - 37, 19, 36, 20, 35, 22, 34, 24, 60, 76, 61, 74, 60, 73, 59, 71, - 0, 30, 0, 47, 0, 81, 0, 85, 33, 32, 30, 31, 28, 32, 46, 29, // mode 5, offset 2560 - 55, 32, 29, 36, 28, 34, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, - 0, 54, 0, 30, 0, 30, 0, 37, 82, 26, 90, 38, 56, 73, 21, 102, - 32, 32, 31, 32, 31, 33, 32, 32, 33, 32, 33, 32, 32, 32, 32, 32, - 0, 33, 0, 38, 0, 63, 0, 82, 32, 31, 32, 31, 30, 31, 37, 30, - 68, 39, 43, 34, 29, 34, 29, 33, 31, 31, 33, 31, 32, 32, 32, 32, - 0, 71, 0, 44, 0, 33, 0, 37, 63, 27, 86, 30, 72, 55, 37, 86, - 31, 32, 30, 33, 30, 32, 31, 32, 33, 32, 33, 32, 32, 31, 33, 31, - 0, 34, 0, 36, 0, 51, 0, 75, 33, 32, 33, 31, 30, 31, 31, 31, - 60, 61, 56, 38, 38, 33, 30, 33, 29, 32, 32, 31, 33, 32, 33, 32, - 0, 80, 0, 60, 0, 41, 0, 38, 47, 29, 73, 27, 78, 41, 53, 68, - 30, 32, 30, 33, 30, 33, 30, 32, 33, 31, 33, 31, 32, 31, 33, 31, - 0, 33, 0, 35, 0, 43, 0, 64, 33, 32, 33, 31, 32, 31, 30, 31, - 43, 77, 55, 54, 46, 39, 35, 34, 35, 30, 29, 32, 31, 32, 33, 32, - 0, 79, 0, 73, 0, 54, 0, 43, 37, 30, 57, 28, 73, 33, 64, 52, - 31, 32, 30, 32, 30, 32, 30, 32, 33, 31, 33, 31, 33, 31, 33, 31, - 0, 33, 0, 34, 0, 38, 0, 54, 33, 32, 33, 31, 33, 31, 31, 31, - 34, 68, 45, 70, 48, 52, 40, 39, 58, 28, 33, 31, 29, 32, 31, 32, - 0, 73, 0, 77, 0, 65, 0, 51, 32, 31, 45, 29, 63, 30, 66, 42, - 34, 34, 31, 32, 31, 31, 30, 32, 33, 31, 32, 32, 33, 31, 33, 31, - 0, 33, 0, 34, 0, 36, 0, 47, 32, 32, 33, 31, 33, 30, 31, 31, - 34, 44, 38, 66, 44, 62, 43, 48, 81, 31, 52, 28, 34, 31, 30, 32, - 0, 64, 0, 75, 0, 71, 0, 59, 31, 31, 38, 30, 53, 30, 61, 37, - 38, 38, 33, 34, 31, 32, 30, 32, 32, 32, 32, 32, 33, 32, 33, 32, - 0, 33, 0, 34, 0, 36, 0, 43, 32, 31, 33, 31, 33, 31, 32, 31, - 35, 31, 37, 49, 41, 60, 43, 54, 71, 54, 70, 33, 48, 30, 35, 31, - 0, 56, 0, 68, 0, 70, 0, 63, 31, 31, 35, 30, 45, 30, 55, 35, - 40, 44, 36, 37, 33, 34, 31, 33, 32, 32, 32, 32, 33, 32, 33, 32, - 0, 33, 0, 34, 0, 36, 0, 41, 32, 31, 32, 31, 33, 31, 33, 31, - 33, 34, 36, 38, 39, 50, 41, 53, 36, 87, 62, 52, 57, 36, 43, 33, - 0, 50, 0, 59, 0, 65, 0, 62, 33, 31, 35, 31, 42, 31, 49, 35, - 41, 48, 37, 41, 35, 36, 33, 34, 36, 32, 34, 32, 33, 32, 34, 33, -}; \ No newline at end of file From e5d05b9cee2646642148abfe43ebb1c7694378d5 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 10:59:31 +0300 Subject: [PATCH 167/237] remove unused functions --- src/strategies/avx2/intra-avx2.c | 903 +------------------------------ 1 file changed, 2 insertions(+), 901 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index debaf417..b029f4f3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -55,433 +55,6 @@ #include "strategies/missing-intel-intrinsics.h" - /** - * \brief Generate angular predictions. - * \param cu_loc CU locationand size data. 
- * \param intra_mode Angular mode in range 2..34. - * \param channel_type Color channel. - * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. - * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. - * \param dst Buffer of size width*width. - * \param multi_ref_idx Reference line index for use with MRL. - */ -static void uvg_angular_pred_avx2_old( - const cu_loc_t* const cu_loc, - const int_fast8_t intra_mode, - const int_fast8_t channel_type, - const uvg_pixel *const in_ref_above, - const uvg_pixel *const in_ref_left, - uvg_pixel *const dst, - const uint8_t multi_ref_idx, - const uint8_t isp_mode, - const int cu_dim) -{ - // ISP_TODO: non-square block implementation, height is passed but not used - const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_log2[width]; - const int log2_height = uvg_g_convert_to_log2[height]; - - assert((log2_width >= 2 && log2_width <= 6) && (log2_height <= 6)); - assert(intra_mode >= 2 && intra_mode <= 66); - - // TODO: implement handling of MRL - uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0; - uint8_t isp = isp_mode; - - __m256i p_shuf_01 = _mm256_setr_epi8( - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, - 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c - ); - - __m256i p_shuf_23 = _mm256_setr_epi8( - 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, - 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, - 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e - ); - - __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a - ); - - __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e - ); - - static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp - static const int32_t pre_scale[] = { 8, 7, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -2, -3 }; - - static const int16_t cubic_filter[32][4] = - { - { 0, 64, 0, 0 }, - { -1, 63, 2, 0 }, - { -2, 62, 4, 0 }, - { -2, 60, 7, -1 }, - { -2, 58, 10, -2 }, - { -3, 57, 12, -2 }, - { -4, 56, 14, -2 }, - { -4, 55, 15, -2 }, - { -4, 54, 16, -2 }, - { -5, 53, 18, -2 }, - { -6, 52, 20, -2 }, - { -6, 49, 24, -3 }, - { -6, 46, 28, -4 }, - { -5, 44, 29, -4 }, - { -4, 42, 30, -4 }, - { -4, 39, 33, -4 }, - { -4, 36, 36, -4 }, - { -4, 33, 39, -4 }, - { -4, 30, 42, -4 }, - { -4, 29, 44, -5 }, - { -4, 28, 46, -6 }, - { -3, 24, 49, -6 }, - { -2, 20, 52, -6 }, - { -2, 18, 53, -5 }, - { -2, 16, 54, -4 }, - { -2, 15, 55, -4 }, - { -2, 14, 56, -4 }, - { -2, 12, 57, -3 }, - { -2, 10, 58, -2 }, - { -1, 7, 60, 
-2 }, - { 0, 4, 62, -2 }, - { 0, 2, 63, -1 }, - }; - - // Temporary buffer for modes 11-25. - // It only needs to be big enough to hold indices from -width to width-1. - uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - - int32_t pred_mode = intra_mode; // ToDo: handle WAIP - - // Whether to swap references to always project on the left reference row. - const bool vertical_mode = intra_mode >= 34; - // Modes distance to horizontal or vertical mode. - const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18); - //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; - - // Sample displacement per column in fractions of 32. - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - - // TODO: replace latter width with height - int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); - - // Pointer for the reference we are interpolating from. - uvg_pixel *ref_main; - // Pointer for the other reference. - const uvg_pixel *ref_side; - - // Set ref_main and ref_side such that, when indexed with 0, they point to - // index 0 in block coordinates. - if (sample_disp < 0) { - memcpy(&temp_main[width], vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); - memcpy(&temp_side[width], vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); - - ref_main = temp_main + width; - ref_side = temp_side + width; - - for (int i = -width; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; - } - - - - //const uint32_t index_offset = width + 1; - //const int32_t last_index = width; - //const int_fast32_t most_negative_index = (width * sample_disp) >> 5; - //// Negative sample_disp means, we need to use both references. - - //// TODO: update refs to take into account variating block size and shapes - //// (height is not always equal to width) - //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1; - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - - //// Move the reference pixels to start from the middle to the later half of - //// the tmp_ref, so there is room for negative indices. - //for (int_fast32_t x = -1; x < width; ++x) { - // tmp_ref[x + index_offset] = ref_main[x]; - //} - //// Get a pointer to block index 0 in tmp_ref. - //ref_main = &tmp_ref[index_offset]; - //tmp_ref[index_offset -1] = tmp_ref[index_offset]; - - //// Extend the side reference to the negative indices of main reference. - //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" - //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)]; - //// TODO: add 'vertical_mode ? height : width' instead of 'width' - // - //for (int_fast32_t x = -1; x > most_negative_index; x--) { - // col_sample_disp += inv_abs_sample_disp; - // int_fast32_t side_index = col_sample_disp >> 8; - // tmp_ref[x + index_offset - 1] = ref_side[side_index - 1]; - //} - //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1]; - //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset]; - } - else { - - memcpy(temp_main, vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1)); - memcpy(temp_side, vertical_mode ? 
in_ref_left : in_ref_above, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1)); - - const int s = 0; - const int max_index = (multi_ref_index << s) + 2; - const int ref_length = width << 1; - const uvg_pixel val = temp_main[ref_length + multi_ref_index]; - memset(temp_main + ref_length + multi_ref_index, val, max_index + 1); - - ref_main = temp_main; - ref_side = temp_side; - //// sample_disp >= 0 means we don't need to refer to negative indices, - //// which means we can just use the references as is. - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1; - - //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel)); - //ref_main = &tmp_ref[width]; - //tmp_ref[width-1] = tmp_ref[width]; - //int8_t last_index = 1 + width*2; - //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1]; - } - - // compensate for line offset in reference line buffers - ref_main += multi_ref_index; - ref_side += multi_ref_index; - - static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; - int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); - - bool use_cubic = true; // Default to cubic filter - if (dist_from_vert_or_hor > filter_threshold) { - if ((abs(sample_disp) & 0x1F) != 0) - { - use_cubic = false; - } - } - // Cubic must be used if ref line != 0 - if (multi_ref_index) { - use_cubic = true; - } - - if (sample_disp != 0) { - // The mode is not horizontal or vertical, we have to do interpolation. - - int_fast32_t delta_pos = sample_disp * multi_ref_index; - int64_t delta_int[4] = { 0 }; - int16_t delta_fract[4] = { 0 }; - for (int_fast32_t y = 0; y + 3 < width; y += 4) { - - for (int yy = 0; yy < 4; ++yy) { - delta_pos += sample_disp; - delta_int[yy] = delta_pos >> 5; - delta_fract[yy] = delta_pos & (32 - 1); - } - - if ((abs(sample_disp) & 0x1F) != 0) { - - // Luma Channel - if (channel_type == 0) { - - int16_t f[4][4] = { { 0 } }; - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[0]], 8); - memcpy(f[1], cubic_filter[delta_fract[1]], 8); - memcpy(f[2], cubic_filter[delta_fract[2]], 8); - memcpy(f[3], cubic_filter[delta_fract[3]], 8); - } - else { - for(int yy = 0; yy < 4; ++yy) { - const int16_t offset = (delta_fract[yy] >> 1); - f[yy][0] = 16 - offset; - f[yy][1] = 32 - offset; - f[yy][2] = 16 + offset; - f[yy][3] = offset; - } - } - - // Do 4-tap intra interpolation filtering - uvg_pixel *p = (uvg_pixel*)ref_main; - __m256i vidx = _mm256_loadu_si256((__m256i *)delta_int); - __m256i all_weights = _mm256_loadu_si256((__m256i *)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { - - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); - - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 
1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); - } - } - else { - - // Do linear filtering - for (int yy = 0; yy < 4; ++yy) { - for (int_fast32_t x = 0; x < width; ++x) { - uvg_pixel ref1 = ref_main[x + delta_int[yy] + 1]; - uvg_pixel ref2 = ref_main[x + delta_int[yy] + 2]; - dst[(y + yy) * width + x] = ref1 + ((delta_fract[yy] * (ref2 - ref1) + 16) >> 5); - } - } - } - } - else { - // Just copy the integer samples - for (int yy = 0; yy < 4; ++yy) { - uvg_pixel *dst_row = dst + (y + yy) * width; - uvg_pixel *ref_row = ref_main + delta_int[yy] + 1; - for (int_fast32_t x = 0; x + 3 < width; x += 4) { - memcpy(dst_row + x, ref_row + x, 4 * sizeof(dst[0])); - } - } - } - - - // PDPC - bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0); - if (pred_mode > 1 && pred_mode < 67) { - if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. - PDPC_filter = false; - } - else if (mode_disp > 0) { - PDPC_filter &= (scale >= 0); - } - } - if(PDPC_filter) { - - int16_t wL[4]; - int16_t left[4][4]; - - int limit = MIN(3 << scale, width); - - for (int x = 0; x < limit; x += 4) { - - for (int xx = 0; xx < 4; ++xx) { - int inv_angle_sum = 256 + (x + xx + 1) * modedisp2invsampledisp[abs(mode_disp)]; - wL[xx] = 32 >> (2 * (x + xx) >> scale); - - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1]; - } - } - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - uint64_t quad; - memcpy(&quad, wL, sizeof(quad)); - __m256i vwL = _mm256_set1_epi64x(quad); - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vwL, accu); - accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32)); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - // Need to mask remainder samples on the last iteration when limit % 4 != 0 - int rem_bits = 8 * (limit - x); - __m128i ones = _mm_set1_epi32(0xFF); - __m128i vmask = _mm_slli_epi32(ones, rem_bits); - - // 0 selects filtered, 1 vdst (unchanged) - vdst = _mm_blendv_epi8(filtered, vdst, vmask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vdst, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vdst, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vdst, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); - } - } - } - } - else { - // Mode is horizontal or vertical, just copy the pixels. 
- - // TODO: update outer loop to use height instead of width - for (int_fast32_t y = 0; y < width; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; - } - if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_width - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; - const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); - } - } - } - } - - // Flip the block if this is was a horizontal mode. - if (!vertical_mode) { - - const __m128i vtranspose_mask =_mm_setr_epi8( - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - - const __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - const __m128i vidx = _mm_slli_epi32(vseq, log2_width); - - // Transpose as 4x4 subblocks - for (int_fast32_t y = 0; y + 3 < width; y += 4) { - for (int_fast32_t x = y; x + 3 < width; x += 4) { - - __m128i vtemp4x4 = _mm_i32gather_epi32((const int32_t*)(dst + x * width + y), vidx, 1); - __m128i v4x4 = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - vtemp4x4 = _mm_shuffle_epi8(vtemp4x4, vtranspose_mask); - v4x4 = _mm_shuffle_epi8(v4x4, vtranspose_mask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vtemp4x4, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vtemp4x4, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vtemp4x4, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vtemp4x4, 3); - - *(uint32_t*)(dst + (x + 0) * width + y) = _mm_extract_epi32(v4x4, 0); - *(uint32_t*)(dst + (x + 1) * width + y) = _mm_extract_epi32(v4x4, 1); - *(uint32_t*)(dst + (x + 2) * width + y) = _mm_extract_epi32(v4x4, 2); - *(uint32_t*)(dst + (x + 3) * width + y) = _mm_extract_epi32(v4x4, 3); - } - } - } -} - - static const int16_t cubic_filter[32][4] = { { 0, 64, 0, 0 }, @@ -1782,62 +1355,6 @@ static void angular_pred_non_fractional_angle_pxl_copy_hor_avx2(uvg_pixel* dst, } -static void angular_pdpc_ver_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) -{ - int16_t wL[4]; - int16_t left[4][4]; - - int limit = MIN(3 << scale, width); - const int log2_width = uvg_g_convert_to_log2[width]; - - - for (int_fast32_t y = 0; y + 3 < height; y += 4) { - for (int x = 0; x < limit; x += 4) { - - for (int xx = 0; xx < 4; ++xx) { - int inv_angle_sum = 256 + (x + xx + 1) * inv_sample_disp; - wL[xx] = 32 >> (2 * (x + xx) >> scale); - - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1]; - } - } - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - uint64_t quad; - memcpy(&quad, wL, sizeof(quad)); - __m256i vwL = _mm256_set1_epi64x(quad); - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vwL, accu); - accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32)); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, 
hi); - - // Need to mask remainder samples on the last iteration when limit % 4 != 0 - int rem_bits = 8 * (limit - x); - __m128i ones = _mm_set1_epi32(0xFF); - __m128i vmask = _mm_slli_epi32(ones, rem_bits); - - // 0 selects filtered, 1 vdst (unchanged) - vdst = _mm_blendv_epi8(filtered, vdst, vmask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(vdst, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(vdst, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(vdst, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(vdst, 3); - } - } -} - - static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2584,69 +2101,6 @@ static void angular_pdpc_ver_h16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } - -static void angular_pdpc_hor_old_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) -{ - int16_t wT[4]; - int16_t ref_top[4][4]; - - int limit = MIN(3 << scale, height); - const int log2_width = uvg_g_convert_to_log2[width]; - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - __m256i vwT_shuffle = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7); - for (int y = 0; y < limit; y += 4) { - for (int x = 0; x < width; x += 4) { - - for (int yy = 0; yy < 4; ++yy) { - int inv_angle_sum = 256 + (y + yy + 1) * inv_sample_disp; - - // Set weight to zero if limit reached. - // This removes the need to blend results with unmodified values in the end. - wT[yy] = y + yy < limit ? 32 >> (2 * (y + yy) >> scale) : 0; - for (int xx = 0; xx < 4; ++xx) { - ref_top[yy][xx] = ref_side[(x + xx) + (inv_angle_sum >> 9) + 1]; - } - } - - __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width + x), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i vtop = _mm256_loadu_si256((__m256i*)ref_top); - uint64_t quad; - memcpy(&quad, wT, sizeof(quad)); - __m256i vwT = _mm256_set1_epi64x(quad); - vwT = _mm256_shuffle_epi8(vwT, vwT_shuffle); - __m256i accu = _mm256_sub_epi16(vtop, vpred16); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - // Need to mask remainder samples on the last iteration when limit % 4 != 0 - //int rem_bits = 8 * (limit - y); - //__m128i ones = _mm_set1_epi32(0xFF); - //__m128i vmask = _mm_slli_epi32(ones, rem_bits); - - //// 0 selects filtered, 1 vdst (unchanged) - //vpred = _mm_blendv_epi8(filtered, vpred, vmask); - - *(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3); - } - } -} - static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -3692,116 +3146,6 @@ static void uvg_angular_pred_avx2( } -/** - * \brief Generate planar prediction. 
- * \param cu_loc CU location and size data. - * \param color Color channel. - * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. - * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. - * \param dst Buffer of size width*width. - */ -static void uvg_intra_pred_planar_avx2_old( - const cu_loc_t* const cu_loc, - color_t color, - const uint8_t *const ref_top, - const uint8_t *const ref_left, - uint8_t *const dst) -{ - // ISP_TODO: non-square block implementation, height is passed but not used - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int log2_width = uvg_g_convert_to_log2[width]; - const int log2_height = uvg_g_convert_to_log2[height]; - - assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); - - const uint8_t top_right = ref_top[width + 1]; - const uint8_t bottom_left = ref_left[width + 1]; - - if (log2_width > 2) { - - __m128i v_width = _mm_set1_epi16(width); - __m128i v_top_right = _mm_set1_epi16(top_right); - __m128i v_bottom_left = _mm_set1_epi16(bottom_left); - - for (int y = 0; y < width; ++y) { - - __m128i x_plus_1 = _mm_setr_epi16(-7, -6, -5, -4, -3, -2, -1, 0); - __m128i v_ref_left = _mm_set1_epi16(ref_left[y + 1]); - __m128i y_plus_1 = _mm_set1_epi16(y + 1); - - for (int x = 0; x < width; x += 8) { - x_plus_1 = _mm_add_epi16(x_plus_1, _mm_set1_epi16(8)); - __m128i v_ref_top = _mm_loadl_epi64((__m128i*)&(ref_top[x + 1])); - v_ref_top = _mm_cvtepu8_epi16(v_ref_top); - - __m128i hor = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(v_width, x_plus_1), v_ref_left), _mm_mullo_epi16(x_plus_1, v_top_right)); - __m128i ver = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(v_width, y_plus_1), v_ref_top), _mm_mullo_epi16(y_plus_1, v_bottom_left)); - - //dst[y * width + x] = ho - - __m128i chunk = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(ver, hor), v_width), (log2_width + 1)); - chunk = _mm_packus_epi16(chunk, chunk); - _mm_storel_epi64((__m128i*)&(dst[y * width + x]), chunk); - } - } - } else { - // Only if log2_width == 2 <=> width == 4 - assert(width == 4); - const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505, - 0x06060606, 0x07070707); - - const __m128i xp1 = _mm_set1_epi32 (0x04030201); - const __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask); - - const __m128i rdist = _mm_set1_epi32 (0x00010203); - const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask); - - const __m128i wid16 = _mm_set1_epi16 (width); - const __m128i tr = _mm_set1_epi8 (top_right); - const __m128i bl = _mm_set1_epi8 (bottom_left); - - uint32_t rt14 = *(const uint32_t *)(ref_top + 1); - uint32_t rl14 = *(const uint32_t *)(ref_left + 1); - uint64_t rt14_64 = (uint64_t)rt14; - uint64_t rl14_64 = (uint64_t)rl14; - uint64_t rtl14 = rt14_64 | (rl14_64 << 32); - - __m128i rtl_v = _mm_cvtsi64_si128 (rtl14); - __m128i rt = _mm_broadcastd_epi32(rtl_v); - __m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask); - - __m128i rtrl_l = _mm_unpacklo_epi8 (rt, rl); - __m128i rtrl_h = _mm_unpackhi_epi8 (rt, rl); - - __m128i bdrd_l = _mm_unpacklo_epi8 (bdist, rdist); - __m128i bdrd_h = _mm_unpackhi_epi8 (bdist, rdist); - - __m128i hvs_lo = _mm_maddubs_epi16 (rtrl_l, bdrd_l); - __m128i hvs_hi = _mm_maddubs_epi16 (rtrl_h, bdrd_h); - - __m128i xp1yp1_l = _mm_unpacklo_epi8 (xp1, yp1); - __m128i xp1yp1_h = _mm_unpackhi_epi8 (xp1, yp1); - __m128i trbl_lh = _mm_unpacklo_epi8 (tr, bl); - - __m128i addend_l = _mm_maddubs_epi16 
(trbl_lh, xp1yp1_l); - __m128i addend_h = _mm_maddubs_epi16 (trbl_lh, xp1yp1_h); - - addend_l = _mm_add_epi16 (addend_l, wid16); - addend_h = _mm_add_epi16 (addend_h, wid16); - - __m128i sum_l = _mm_add_epi16 (hvs_lo, addend_l); - __m128i sum_h = _mm_add_epi16 (hvs_hi, addend_h); - - // Shift right by log2_width + 1 - __m128i sum_l_t = _mm_srli_epi16 (sum_l, 3); - __m128i sum_h_t = _mm_srli_epi16 (sum_h, 3); - __m128i result = _mm_packus_epi16 (sum_l_t, sum_h_t); - _mm_storeu_si128((__m128i *)dst, result); - } -} - - typedef void (intra_planar_half_func)(const uvg_pixel* ref_main, const uvg_pixel* ref_side, const int line, const int shift, __m256i* dst); // w1 and w2 for planar horizontal do not exist, since intra prediction must be at least of width 4 @@ -4222,7 +3566,7 @@ static intra_planar_half_func* planar_func_table[2][7] = { }; -void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, +static void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, color_t color, const uint8_t* const ref_top, const uint8_t* const ref_left, @@ -4760,47 +4104,6 @@ static void uvg_pdpc_planar_dc_avx2( } } -void uvg_mip_boundary_downsampling_1D_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_src, int src_len, int dst_len) -{ - // Source length can be 4, 8, 16, 32 or 64 - // Destination length can be 2 or 4 - - // Due to the small size of dst_len, not much can be done with AVX2 here - - if (dst_len < src_len) - { - // Create reduced boundary by downsampling - // Maximum down sample factor is 64 / 2 = 32 - uint16_t down_smp_factor = src_len / dst_len; - const int log2_factor = uvg_math_floor_log2(down_smp_factor); - const int rounding_offset = (1 << (log2_factor - 1)); - - uint16_t src_idx = 0; - // This loop is run max 4 times - for (uint16_t dst_idx = 0; dst_idx < dst_len; dst_idx++) { - int sum = 0; - // Sum together up tp 32 sequential source samples - for (int k = 0; k < down_smp_factor; k++) { - sum += ref_src[src_idx++]; - } - reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor; - // I can only see limited optimization potential here. There's a lot of additions, but not too much data. - // For down sample factor 2, a simple horizontal add would do wonders, but it can only handle that specific case. - // There needs to be several versions of this function for different cases, not entirely sure if its worth it. - } - } - else - { - // Copy boundary if no downsampling is needed. If this branch is reached, dst_len must be 4 - memcpy(reduced_dst, ref_src, 4 * sizeof(uvg_pixel)); // Copy as much as dst_len indicates - - /*for (uint16_t i = 0; i < dst_len; ++i) - { - reduced_dst[i] = ref_src[i]; - }*/ - } -} - static INLINE void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, const uvg_pixel* const ref_top, const uvg_pixel* const ref_left) { const uint8_t down_smp_factor = 2; // width / red_bdry_size @@ -4959,66 +4262,6 @@ static INLINE void mip_ref_downsampling_1D_64to4_avx2(uvg_pixel* reduced_dst, co } -// This function is not optimized, do not use in production. It is left here for reference. 
-void uvg_mip_reduced_pred_avx2(uvg_pixel* const output, - const int16_t* const input, - const uint8_t* matrix, - const bool transpose, - const int red_bdry_size, - const int red_pred_size, - const int size_id, - const int in_offset, - const int in_offset_tr) -{ - // Reduced boundary size is 2 or 4 -> input size is 4 or 8 - const int input_size = 2 * red_bdry_size; - - // Use local buffer for transposed result - uvg_pixel out_buf_transposed[64]; // Max size 8x8, was LCU_WIDTH * LCU_WIDTH - uvg_pixel* out_ptr = transpose ? out_buf_transposed : output; - - int sum = 0; - for (int i = 0; i < input_size; i++) { - sum += input[i]; - } - const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum; - assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four"); - - const uint8_t* weight = matrix; - const int input_offset = transpose ? in_offset_tr : in_offset; - - int pos_res = 0; - - // Reduced prediction size is 4 or 8 - for (int y = 0; y < red_pred_size; y++) { - for (int x = 0; x < red_pred_size; x++) { - // Use 16-bit intermediates - int tmp0 = input[0] * weight[0]; - int tmp1 = input[1] * weight[1]; - int tmp2 = input[2] * weight[2]; - int tmp3 = input[3] * weight[3]; - for (int i = 4; i < input_size; i += 4) { - tmp0 += input[i] * weight[i]; - tmp1 += input[i + 1] * weight[i + 1]; - tmp2 += input[i + 2] * weight[i + 2]; - tmp3 += input[i + 3] * weight[i + 3]; - } - out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); - pos_res++; - weight += input_size; - } - } - - if (transpose) { - for (int y = 0; y < red_pred_size; y++) { - for (int x = 0; x < red_pred_size; x++) { - output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y]; - } - } - } -} - - // Size ID 0 static INLINE void mip_reduced_pred_sid0_avx2(uvg_pixel* const output, const int16_t* const input, @@ -5447,144 +4690,6 @@ void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, } -void uvg_mip_pred_upsampling_1D_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, - const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, - const uint8_t src_step, const uint8_t src_stride, - const uint8_t dst_step, const uint8_t dst_stride, - const uint8_t boundary_step, - const uint8_t ups_factor) -{ - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); - const int rounding_offset = 1 << (log2_factor - 1); - - uint16_t idx_orth_dim = 0; - const uvg_pixel* src_line = src; - uvg_pixel* dst_line = dst; - const uvg_pixel* boundary_line = boundary + boundary_step - 1; - while (idx_orth_dim < src_size_orth_dim) - { - uint16_t idx_upsample_dim = 0; - const uvg_pixel* before = boundary_line; - const uvg_pixel* behind = src_line; - uvg_pixel* cur_dst = dst_line; - while (idx_upsample_dim < src_size_ups_dim) - { - uint16_t pos = 1; - int scaled_before = (*before) << log2_factor; - int scaled_behind = 0; - while (pos <= ups_factor) - { - scaled_before -= *before; - scaled_behind += *behind; - *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; - - pos++; - cur_dst += dst_step; - } - - idx_upsample_dim++; - before = behind; - behind += src_step; - } - - idx_orth_dim++; - src_line += src_stride; - dst_line += dst_stride; - boundary_line += boundary_step; - } -} - -void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, - const uint8_t 
red_pred_size, const uint16_t dst_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) -{ - const uint8_t ref_step = ups_ver_factor; // height / red_pred_size - const uint8_t ups_factor = ups_hor_factor; // width / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); - const int rounding_offset = 1 << (log2_factor - 1); - - uint16_t idx_orth_dim = 0; - const uvg_pixel* src_line = src; - uvg_pixel* dst_line = dst; - const uvg_pixel* ref_line = ref + ref_step - 1; - for (int idx_orth_dim = 0; idx_orth_dim < red_pred_size; ++idx_orth_dim) { - uint16_t idx_upsample_dim = 0; - const uvg_pixel* before = ref_line; - const uvg_pixel* behind = src_line; - uvg_pixel* cur_dst = dst_line; - for (int idx_upsample_dim = 0; idx_upsample_dim < red_pred_size; ++idx_upsample_dim) { - uint16_t pos = 1; - int scaled_before = (*before) << log2_factor; - int scaled_behind = 0; - for (int pos = 0; pos < ups_factor; ++pos) { - scaled_before -= *before; - scaled_behind += *behind; - *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; - - cur_dst++; // Destination step is 1 - } - - before = behind; - behind++; // Source step is 1 - } - - src_line += red_pred_size; // Source stride is same as red_pred_size - dst_line += dst_step; // Destination stride is same as ver_src_step, which is width * ups_ver_factor. Can be as high as 512, must be 16-bit - ref_line += ref_step; - } -} - -void uvg_mip_pred_upsampling_1D_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, - const uint8_t src_size_ups_dim, const uint16_t src_size_orth_dim, - const uint16_t src_step, const uint8_t src_stride, - const uint8_t dst_step, const uint8_t dst_stride, - const uint8_t boundary_step, - const uint8_t ups_factor) -{ - const int log2_factor = uvg_math_floor_log2(ups_factor); - assert(ups_factor >= 2 && "Upsampling factor must be at least 2."); - const int rounding_offset = 1 << (log2_factor - 1); - - uint16_t idx_orth_dim = 0; - const uvg_pixel* src_line = src; - uvg_pixel* dst_line = dst; - const uvg_pixel* boundary_line = boundary + boundary_step - 1; - while (idx_orth_dim < src_size_orth_dim) - { - uint16_t idx_upsample_dim = 0; - const uvg_pixel* before = boundary_line; - const uvg_pixel* behind = src_line; - uvg_pixel* cur_dst = dst_line; - while (idx_upsample_dim < src_size_ups_dim) - { - uint16_t pos = 1; - int scaled_before = (*before) << log2_factor; - int scaled_behind = 0; - while (pos <= ups_factor) - { - scaled_before -= *before; - scaled_behind += *behind; - *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor; - - pos++; - cur_dst += dst_step; - } - - idx_upsample_dim++; - before = behind; - behind += src_step; - } - - idx_orth_dim++; - src_line += src_stride; - dst_line += dst_stride; - boundary_line += boundary_step; - } -} - - // 8x8, size id 1 hor upscale params [4, 4, 1, 4, 1, 16, 2, 2] static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { @@ -7848,7 +6953,7 @@ static void mip_upsampling_w64_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg /** \brief Matrix weighted intra prediction. 
*/ -void mip_predict_avx2( +static void mip_predict_avx2( //const encoder_state_t* const state, const uvg_intra_references* const refs, const uint16_t pred_block_width, @@ -7914,7 +7019,6 @@ void mip_predict_avx2( } else { // Horizontal downsampling - // uvg_mip_boundary_downsampling_1D_avx2(top_reduced, ref_samples_top, width, red_bdry_size); switch (width) { case 4: // 4x4 case handled elsewhere. @@ -7931,7 +7035,6 @@ void mip_predict_avx2( } // Vertical downsampling - // uvg_mip_boundary_downsampling_1D_avx2(left_reduced, ref_samples_left, height, red_bdry_size); switch (height) { case 4: // 4x4 case handled elsewhere. @@ -8023,8 +7126,6 @@ void mip_predict_avx2( ver_src = hor_dst; ver_src_step *= ups_ver_factor; - // void uvg_mip_pred_upsampling_1D_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const boundary, const uint8_t red_pred_size, const uint8_t ver_src_step, const uint8_t ups_ver_factor, const uint8_t ups_hor_factor) - switch (width) { // Case 4 does not exist. There is no need for horizontal upsampling when width is 4. case 8: From e3cabd3ff417d5413a478b634d6b07a3672bc582 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 11:33:42 +0300 Subject: [PATCH 168/237] static --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b029f4f3..0aa4aa83 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4361,7 +4361,7 @@ static INLINE void mip_reduced_pred_sid0_avx2(uvg_pixel* const output, } // Size ID 1 -void INLINE mip_reduced_pred_sid1_avx2(uvg_pixel* const output, +static void INLINE mip_reduced_pred_sid1_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, const bool transpose, @@ -4503,7 +4503,7 @@ void INLINE mip_reduced_pred_sid1_avx2(uvg_pixel* const output, } // Size ID 2 -void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, +static void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, const int16_t* const input, const uint16_t* matrix, const bool transpose, From 14524a31c2e2366efb7a72430a27186b06677675 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 11:46:51 +0300 Subject: [PATCH 169/237] unaligned --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0aa4aa83..44485149 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2202,7 +2202,7 @@ static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, for (int x = 0; x < width; x += 16) { __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m128i vtop = _mm_load_si128((__m128i*)&ref_side[x + shifted_inv_angle_sum[y] + 1]); + __m128i vtop = _mm_loadu_si128((__m128i*)&ref_side[x + shifted_inv_angle_sum[y] + 1]); __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); __m256i accu = _mm256_sub_epi16(vtop16, vpred16); @@ -3641,7 +3641,7 @@ static void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, __m256i v_tmp = _mm256_packus_epi16(v_res[s + 0], v_res[s + 1]); v_tmp = _mm256_permute4x64_epi64(v_tmp, _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_store_si256((__m256i*)&dst[i], v_tmp); + _mm256_storeu_si256((__m256i*)&dst[i], v_tmp); } } } From cc6abd66759d5d4323eb696dd2c7583381bf7854 Mon Sep 17 00:00:00 2001 From: Joose Sainio 
Date: Fri, 10 May 2024 14:42:11 +0300 Subject: [PATCH 170/237] Fix mrl indexing --- src/strategies/avx2/intra-avx2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 44485149..c324f541 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2853,7 +2853,9 @@ static void uvg_angular_pred_avx2( // Set delta table pointers const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 13) * 64 : (81 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64); const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset]; + delta_int += multi_ref_index; // TODO: This are not necessarily large enough for 64 dimension blocks const int16_t* delta_fract = wide_angle_mode ? &delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; + delta_fract += multi_ref_index; // Check if the angle is fractional. If yes, interpolation is needed if ((abs(sample_disp) & 0x1F) != 0) { From 5b71909e474f2d3b366ee16e135f6e0e84573a83 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 15:36:29 +0300 Subject: [PATCH 171/237] unaligned --- src/strategies/avx2/intra-avx2.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c324f541..c2343d30 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4657,10 +4657,6 @@ static void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, _mm_storeu_si128((__m128i*)out_ptr, vres8); out_ptr += 16; } - - //out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset); - out_ptr += 16; - weight += input_size * 4; } if (transpose) { @@ -4840,7 +4836,7 @@ static void mip_upsampling_w16_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pix before[24] = ref_ptr[ref_step * 3]; __m256i vbefore = _mm256_load_si256((__m256i*)before); - __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + __m256i vbehind = _mm256_loadu_si256((__m256i*)src_ptr); __m256i vavg = _mm256_avg_epu8(vbefore, vbehind); @@ -5074,10 +5070,10 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i vtmp2 = _mm256_unpacklo_epi16(left_temp1, right_temp1); __m256i vtmp3 = _mm256_unpackhi_epi16(left_temp1, right_temp1); - _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 0), vtmp0); - _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 1), vtmp1); - _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 2), vtmp2); - _mm256_store_si256((__m256i*)(dst_ptr + dst_step * 3), vtmp3); + _mm256_storeu_si256((__m256i*)(dst_ptr + dst_step * 0), vtmp0); + _mm256_storeu_si256((__m256i*)(dst_ptr + dst_step * 1), vtmp1); + _mm256_storeu_si256((__m256i*)(dst_ptr + dst_step * 2), vtmp2); + _mm256_storeu_si256((__m256i*)(dst_ptr + dst_step * 3), vtmp3); src_ptr += 32; ref_ptr += ref_step * 4; @@ -5773,7 +5769,7 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix __m128i vbehind6 = _mm_loadu_si128((__m128i*)(src + 192)); __m128i vbehind7 = _mm_loadu_si128((__m128i*)(src + 224)); - __m128i vbefore0 = _mm_load_si128((__m128i*)ref); + __m128i vbefore0 = _mm_loadu_si128((__m128i*)ref); __m128i vbefore1 = vbehind0; __m128i vbefore2 = vbehind1; __m128i vbefore3 = vbehind2; @@ -6135,14 +6131,14 @@ static void 
mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; - __m256i vbefore = _mm256_load_si256((__m256i*)ref); + __m256i vbefore = _mm256_loadu_si256((__m256i*)ref); const __m256i zeros = _mm256_setzero_si256(); const __m256i ones = _mm256_set1_epi8(1); const __m256i threes = _mm256_set1_epi8(3); for (int i = 0; i < 8; ++i) { - __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + __m256i vbehind = _mm256_loadu_si256((__m256i*)src_ptr); // Calculate the 3 interpolated lines between before and behind. Top row, middle row and bottom row. __m256i vmiddle = _mm256_avg_epu8(vbefore, vbehind); @@ -6166,9 +6162,9 @@ static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg vbottom = _mm256_sub_epi8(vbottom, sub_amount); // Store results - _mm256_store_si256((__m256i*)(dst_ptr + 0), vtop); - _mm256_store_si256((__m256i*)(dst_ptr + 32), vmiddle); - _mm256_store_si256((__m256i*)(dst_ptr + 64), vbottom); + _mm256_storeu_si256((__m256i*)(dst_ptr + 0), vtop); + _mm256_storeu_si256((__m256i*)(dst_ptr + 32), vmiddle); + _mm256_storeu_si256((__m256i*)(dst_ptr + 64), vbottom); vbefore = vbehind; src_ptr += 128; From 6452589441d38f469c96edf7351ff3bfd871c0f8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 15:50:01 +0300 Subject: [PATCH 172/237] too small variable --- src/strategies/avx2/intra-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c2343d30..3fe48603 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4114,7 +4114,7 @@ static INLINE void mip_ref_downsampling_4x4_4to2_avx2(uvg_pixel* reduced_dst, co const __m128i vrnd = _mm_set1_epi16(rounding_offset); - ALIGNED(16) uint32_t ref[2]; + ALIGNED(16) uint32_t ref[4]; ref[0] = *(uint32_t*)ref_top; ref[1] = *(uint32_t*)ref_left; From 8c500627cd2fd104545cc915ddead0218d9f63b9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 18:33:08 +0300 Subject: [PATCH 173/237] Fix --- src/strategies/avx2/intra-avx2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3fe48603..0e325a07 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4657,6 +4657,7 @@ static void INLINE mip_reduced_pred_sid2_avx2(uvg_pixel* const output, _mm_storeu_si128((__m128i*)out_ptr, vres8); out_ptr += 16; } + weight += input_size * 4; } if (transpose) { From 3b9c450356876ed37473b8fed358a64672fc3c75 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 18:43:12 +0300 Subject: [PATCH 174/237] static --- src/strategies/avx2/intra_avx2_tables.h | 77 ++++++++++--------------- 1 file changed, 31 insertions(+), 46 deletions(-) diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 8852eabc..40da82c9 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3,50 +3,35 @@ #include "global.h" -// Test tables -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m40[] = { - 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, -}; - -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_w4_m30[] = { - 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, - 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, - 0x0a, 0x0b, 0x09, 0x0a, 0x09, 0x0a, 0x08, 0x09, - 0x0b, 0x0c, 0x0a, 0x0b, 0x0a, 0x0b, 0x09, 0x0a -}; 
- -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_w4_m30_coeff[] = { - 20, 12, 8, 24, 28, 04, 16, 16, 20, 12, 8, 24, 28, 04, 16, 16, -}; // The number of unique 128-bit coefficient vectors for a given prediction mode. Applicable for width 4 chroma linear interpolation. -ALIGNED(32) const int8_t coeff_vector128_num_by_mode[33] = { +static ALIGNED(32) const int8_t coeff_vector128_num_by_mode[33] = { 1, 16, 8, 16, 4, 8, 1, 8, 4, 8, 2, 8, 4, 16, 8, 16, 1, 16, 8, 16, 4, 8, 2, 8, 4, 8, 1, 8, 4, 16, 8, 16, 1 }; -ALIGNED(32) const int8_t coeff_vector128_num_by_mode_wide_angle[14] = { +static ALIGNED(32) const int8_t coeff_vector128_num_by_mode_wide_angle[14] = { 1, 16, 1, 16, 1, 8, 8, 16, 1, 16, 16, 16, 16, 16 }; -ALIGNED(32) const int16_t coeff_table_mode_offsets[33] = { +static ALIGNED(32) const int16_t coeff_table_mode_offsets[33] = { 0, 16, 272, 400, 656, 720, 848, 864, 992, 1056, 1184, 1216, 1344, 1408, 1664, 1792, 2048, 2064, 2320, 2448, 2704, 2768, 2896, 2928, 3056, 3120, 3248, 3264, 3392, 3456, 3712, 3840, 4096 }; -ALIGNED(32) const int16_t mode_to_weight_table_offset_w4_hor[35] = { +static ALIGNED(32) const int16_t mode_to_weight_table_offset_w4_hor[35] = { 0, 0, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512 }; -ALIGNED(32) const int16_t mode_to_shuffle_vector_table_offset_w4_hor[35] = { +static ALIGNED(32) const int16_t mode_to_shuffle_vector_table_offset_w4_hor[35] = { 0, 0, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024 }; // Index with (mode - 2) * 8 + (y >> 2). The given index will point to the correct place in shuffle vector table. -ALIGNED(32) const int16_t intra_chroma_linear_interpolation_w4_ver_shuffle_vector_offset[] = { +static ALIGNED(32) const int16_t intra_chroma_linear_interpolation_w4_ver_shuffle_vector_offset[] = { 0, 0, 0, 0, 0, 0, 0, 0, // Mode 2 0, 0, 32, 0, 0, 64, 0, 0, // Mode 3 0, 64, 32, 0, 0, 64, 32, 0, // Mode 4 @@ -84,7 +69,7 @@ ALIGNED(32) const int16_t intra_chroma_linear_interpolation_w4_ver_shuffle_vecto // Shuffle vectors for w4 vertical. This is indexed based on the shape of delta int table for each mode. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ver[] = { // Shape of the delta int table in sets of four +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ver[] = { // Shape of the delta int table in sets of four 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // [0, 1, 2, 3] 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // [0, 1, 1, 2] 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // [0, 0, 1, 2] @@ -106,7 +91,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ve // NOTE: shuffle vectors for w8, w16, and w32 vertical do not exists as they are not needed. 
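// A rough sketch of how these shuffle vectors and the weight tables further down are
// consumed (an assumption based on the interleaved (w0, w1) pairs summing to 32, not code
// taken from this header): the shuffle picks two adjacent reference bytes for each output
// sample and the matching weight pair applies a 1/32-precision two-tap blend, roughly
//   out = (w0 * ref[d] + w1 * ref[d + 1] + 16) >> 5;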
-ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, // Mode 2 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, @@ -242,7 +227,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w4_ho }; -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 2 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 3 @@ -312,7 +297,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w8_ho }; -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 2 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, @@ -448,7 +433,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w16_h }; -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_hor[] = { 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, // Mode 2 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, 0x0f, 0x10, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, // Mode 3 @@ -519,7 +504,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vectors_w32_h // Chroma linear interpolation filter weights for width 8, vertical modes. These also work for w16 and w32. 
-ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 @@ -556,7 +541,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { }; // Chroma linear interpolation filter weights for width 8, vertical wide angle modes. These also work for w16 and w32. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver_wide_angle[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver_wide_angle[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 Offset 0 11, 21, 22, 10, 1, 31, 12, 20, 23, 9, 2, 30, 13, 19, 24, 8, 3, 29, 14, 18, 25, 7, 4, 28, 15, 17, 26, 6, 5, 27, 16, 16, 27, 5, 6, 26, 17, 15, 28, 4, 7, 25, 18, 14, 29, 3, 8, 24, 19, 13, 30, 2, 9, 23, 20, 12, 31, 1, 10, 22, 21, 11, 32, 0, // Mode -11 Offset 64 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -10 Offset 128 @@ -574,7 +559,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver_wide_a }; // Chroma linear interpolation filter weights for width 4, horizontal modes -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 3, 29, 6, 26, 9, 23, 12, 20, // Mode 3 6, 26, 12, 20, 18, 14, 24, 8, 6, 26, 12, 20, 18, 14, 24, 8, // Mode 4 @@ -612,7 +597,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_hor[] = { // Chroma linear interpolation filter weights for width 8, horizontal modes -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, // Mode 3 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, // Mode 4 @@ -650,7 +635,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_hor[] = { // Chroma linear interpolation filter weights for width 16, horizontal modes. 
-ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, // Mode 3 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 @@ -688,7 +673,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor[] = { // Chroma linear interpolation filter weights for width 32, horizontal modes. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 3, 29, 6, 26, 9, 23, 12, 20, 15, 17, 18, 14, 21, 11, 24, 8, 27, 5, 30, 2, 1, 31, 4, 28, 7, 25, 10, 22, 13, 19, 16, 16, 19, 13, 22, 10, 25, 7, 28, 4, 31, 1, 2, 30, 5, 27, 8, 24, 11, 21, 14, 18, 17, 15, 20, 12, 23, 9, 26, 6, 29, 3, 32, 0, // Mode 3 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, 6, 26, 12, 20, 18, 14, 24, 8, 30, 2, 4, 28, 10, 22, 16, 16, 22, 10, 28, 4, 2, 30, 8, 24, 14, 18, 20, 12, 26, 6, 32, 0, // Mode 4 @@ -726,7 +711,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w32_hor[] = { // Chroma linear interpolation filter weights for width 4, vertical modes. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode 2 Offset 0 3, 29, 3, 29, 3, 29, 3, 29, 6, 26, 6, 26, 6, 26, 6, 26, // Mode 3 Offset 16 9, 23, 9, 23, 9, 23, 9, 23, 12, 20, 12, 20, 12, 20, 12, 20, @@ -988,7 +973,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver[4112] // Chroma linear interpolation filter weights for width 4, wide angle vertical modes. -ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_angle[2368] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_angle[2368] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 Offset 0 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, // Mode -11 Offset 16 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, @@ -1161,7 +1146,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w4_ver_wide_a // NOTE: this table can also be used by horizontal w4 and w8 wide angle functions since their tables are just a subset of this one. // Chroma linear interpolation filter weights for width 4, horizontal wide angle modes. 
-ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[] = { +static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor_wide_angle[] = { 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, // Mode -12 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, @@ -1278,7 +1263,7 @@ ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w16_hor_wide_ // Weights for intra pdpc w4 horizontal. -ALIGNED(32) const int16_t intra_pdpc_w4_hor_weight[] = { +static ALIGNED(32) const int16_t intra_pdpc_w4_hor_weight[] = { 32, 32, 32, 32, 8, 8, 8, 8, 2, 2, 2, 2, 0, 0, 0, 0, // Scale 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1295,7 +1280,7 @@ ALIGNED(32) const int16_t intra_pdpc_w4_hor_weight[] = { // Weights for intra pdpc w8 horizontal. -ALIGNED(32) const int16_t intra_pdpc_w8_hor_weight[] = { +static ALIGNED(32) const int16_t intra_pdpc_w8_hor_weight[] = { 32, 32, 32, 32, 32, 32, 32, 32, 8, 8, 8, 8, 8, 8, 8, 8, // Scale 0 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1324,7 +1309,7 @@ ALIGNED(32) const int16_t intra_pdpc_w8_hor_weight[] = { // Weights for intra pdpc w4 vertical. -ALIGNED(32) const int16_t intra_pdpc_w4_ver_weight[] = { +static ALIGNED(32) const int16_t intra_pdpc_w4_ver_weight[] = { 32, 8, 2, 0, 32, 8, 2, 0, 32, 8, 2, 0, 32, 8, 2, 0, // Scale 0 32, 16, 8, 4, 32, 16, 8, 4, 32, 16, 8, 4, 32, 16, 8, 4, // Scale 1 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, 32, 32, 16, 16, // Scale 2 @@ -1332,7 +1317,7 @@ ALIGNED(32) const int16_t intra_pdpc_w4_ver_weight[] = { // Weights for intra pdpc w8 vertical. -ALIGNED(32) const int16_t intra_pdpc_w8_ver_weight[] = { +static ALIGNED(32) const int16_t intra_pdpc_w8_ver_weight[] = { 32, 8, 2, 0, 0, 0, 0, 0, 32, 8, 2, 0, 0, 0, 0, 0, // Scale 0 32, 16, 8, 4, 2, 1, 0, 0, 32, 16, 8, 4, 2, 1, 0, 0, // Scale 1 32, 32, 16, 16, 8, 8, 4, 4, 32, 32, 16, 16, 8, 8, 4, 4, // Scale 2 @@ -1340,7 +1325,7 @@ ALIGNED(32) const int16_t intra_pdpc_w8_ver_weight[] = { // Weights for intra pdpc w16 vertical. -ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { +static ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Scale 0 32, 16, 8, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Scale 1 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1, 0, 0, 0, 0, // Scale 2 @@ -1349,7 +1334,7 @@ ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. -ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { +static ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Mode disp 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, // Mode disp 1 @@ -1420,7 +1405,7 @@ ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { // TODO: prune this table. These is a ton of duplicates. Pruning may introduce some extra logic, but it will save a lot of space and probably speed up memory access. 
// NOTE: The vectors from this table can be only used up from mode disp 6. The reference samples are too sparse for vectorized shuffle below mode disp 6. // Shuffle vectors for w4 horizontal pdpc. -ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_hor[] = { +static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_hor[] = { 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, // Mode disp 0 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, 0x000, 0x001, 0x002, 0x003, @@ -1937,7 +1922,7 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_hor[] = { // Shuffle vectors for w4 vertical pdpc. -ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { +static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 0 0x000, 0x020, 0x040, 0x060, 0x001, 0x021, 0x041, 0x061, 0x002, 0x022, 0x042, 0x062, 0x003, 0x023, 0x043, 0x063, // Mode disp 1 0x000, 0x010, 0x020, 0x030, 0x001, 0x011, 0x021, 0x031, 0x002, 0x012, 0x022, 0x032, 0x003, 0x013, 0x023, 0x033, // Mode disp 2 @@ -1975,7 +1960,7 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { // Shuffle vectors for 8x2 scale 1 vertical pdpc. 0xfff entries are "don't care", those will be zeroed out by zero weights // These are basically same as the 8x2 scale2 vectors, but with added "don't care" entries. This table can be safely removed. -ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { +static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 0 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0xfff, 0xfff, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0xfff, 0xfff, // Mode disp 1 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0xfff, 0xfff, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0xfff, 0xfff, // Mode disp 2 @@ -2012,7 +1997,7 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { // Shuffle vectors for 8x2 scale 2 vertical pdpc. -ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { +static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 0 -- Unused 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0x0c1, 0x0e1, // Mode disp 1 * 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0x061, 0x071, // Mode disp 2 * @@ -2049,7 +2034,7 @@ ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { // Shuffle vectors for w16 scale 2 vertical pdpc. 
-ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { +static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // Mode disp 0 -- Unused 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x100, 0x120, 0x140, 0x160, 0x180, 0x1a0, 0x1c0, 0x1e0, // Mode disp 1 * 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080, 0x090, 0x0a0, 0x0b0, 0x0c0, 0x0d0, 0x0e0, 0x0f0, // Mode disp 2 * From 7db32df1bc093ec165429ced62a3ddd49f6ff102 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 10 May 2024 19:34:52 +0300 Subject: [PATCH 175/237] more unalign --- src/strategies/avx2/intra-avx2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0e325a07..14b9637c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -551,7 +551,7 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } - __m128i tmp = _mm_load_si128((__m128i*)delta_int); + __m128i tmp = _mm_loadu_si128((__m128i*)delta_int); __m256i vidx = _mm256_cvtepi16_epi32(tmp); __m256i weights = _mm256_loadu_si256((__m256i*)f); @@ -598,8 +598,8 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } for (int x = 0; x < width; x += 16) { - __m128i tmp0 = _mm_load_si128((__m128i*)&delta_int[x]); - __m128i tmp1 = _mm_load_si128((__m128i*)&delta_int[x + 8]); + __m128i tmp0 = _mm_loadu_si128((__m128i*)&delta_int[x]); + __m128i tmp1 = _mm_loadu_si128((__m128i*)&delta_int[x + 8]); __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); From f4051c98619f356c90c059e289517c0690133c7b Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 30 May 2024 10:56:24 +0300 Subject: [PATCH 176/237] Fix two broken pdpc functions --- src/strategies/avx2/intra-avx2.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 14b9637c..f89b9360 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1796,21 +1796,21 @@ static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re // For width 8, height must be at least 2. Handle 2 lines at once. 
for (int y = 0; y < height; y += 2) { - /*ALIGNED(32) int16_t left[16] = { 0 }; + ALIGNED(32) int16_t left[16] = { 0 }; for (int yy = 0; yy < 2; ++yy) { for (int xx = 0; xx < limit; ++xx) { left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; } - }*/ - __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); + } + //__m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + //vleft = _mm_shuffle_epi8(vleft, vshuf); __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - //__m256i vleft = _mm256_loadu_si256((__m256i*)left); + //__m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + __m256i accu = _mm256_sub_epi16(vleft, vdst16); accu = _mm256_mullo_epi16(vweight, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); @@ -4045,11 +4045,11 @@ static void uvg_pdpc_planar_dc_avx2( ); // TODO: replace latter log2_width with log2_height - const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2); + const int scale = ((log2_width - 2 + log2_height - 2 + 2) >> 2); // Same weights regardless of axis, compute once int16_t w[LCU_WIDTH]; - for (int i = 0; i < width; i += 4) { + for (int i = 0; i < MAX(width, height); i += 4) { __m128i base = _mm_set1_epi32(i); __m128i offs = _mm_setr_epi32(0, 1, 2, 3); __m128i idxs = _mm_add_epi32(base, offs); From ac3ecf0cbc48b40a8a6c4be842224593148bffa0 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 30 May 2024 15:07:57 +0300 Subject: [PATCH 177/237] [avx2] Implement vectorized non fractional angle pixel copy. --- src/strategies/avx2/intra-avx2.c | 590 +++++++++++++++++++++++++++++-- 1 file changed, 570 insertions(+), 20 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f89b9360..72a49b29 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1314,20 +1314,6 @@ static void angular_pred_linear_filter_w16_hor_wide_angle_avx2(uvg_pixel* dst, u } -static void angular_pred_linear_filter_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) -{ - // 2-tap linear filter - - // Handle filtering in 4x4 blocks - for (int y = 0; y < height; y += 4) { - const __m256i vref = _mm256_loadu_si256((const __m256i*) & ref[y + 1]); - for (int x = 0; x < width; x += 4) { - - } - } -} - - static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { // Note: this probably won't work for wide angle modes. 
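The width-specialized horizontal copy routines added in the next hunk replace the generic
angular_pred_non_fractional_angle_pxl_copy_hor_avx2 loop removed there. For a non-fractional
angle every output column x is a straight run of reference samples, so the block is in effect
a transpose of the reference. A minimal scalar sketch of what all the new variants compute,
taken from the removed generic version:

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst[y * width + x] = ref[delta_int[x] + y + 1];
    }
  }

The AVX2 variants specialize this per width: for mode 2 (where delta_int starts at 1 and
advances by one per column, as the shuffle patterns encode) a single reference load plus byte
shuffles suffices, while the wide-angle cases load or gather one column per lane and transpose
4x4, 8x8, 16x16 or 32x32 tiles in registers.
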
@@ -1344,16 +1330,536 @@ static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, } } -static void angular_pred_non_fractional_angle_pxl_copy_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) +static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height) { - // TODO: replace this generic solution after testing - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - dst[y * width + x] = ref[delta_int[x] + y + 1]; + // const int width = 4; + + const __m128i vrefshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06 + ); + + const __m128i vrefshuf1 = _mm_setr_epi8( + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a + ); + + // Handle as 4x4 blocks. There is no case where height < 4. + if (height == 4) { + // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1. + __m128i vref = _mm_loadu_si128((__m128i*)&ref[2]); + vref = _mm_shuffle_epi8(vref, vrefshuf0); + + _mm_store_si128((__m128i*)dst, vref); + } + else { + // Can handle 8 rows at once + for (int y = 0; y < height; y += 8) { + + __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y)); + + __m128i vres0 = _mm_shuffle_epi8(vref, vrefshuf0); + __m128i vres1 = _mm_shuffle_epi8(vref, vrefshuf1); + + _mm_store_si128((__m128i*)(dst + 0), vres0); + _mm_store_si128((__m128i*)(dst + 16), vres1); + dst += 32; } } } +static void angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height) +{ + + const __m128i vrefshuf0 = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 + ); + + const __m128i vrefshuf1 = _mm_setr_epi8( + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a + ); + + const __m128i vrefshuf2 = _mm_setr_epi8( + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c + ); + + const __m128i vrefshuf3 = _mm_setr_epi8( + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e + ); + + // Can handle 8 rows at once. There is no case for height 2 and 4, this function is not reached in those cases. + for (int y = 0; y < height; y += 8) { + // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1. + __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y)); + _mm_store_si128((__m128i*)(dst + 0), _mm_shuffle_epi8(vref, vrefshuf0)); + _mm_store_si128((__m128i*)(dst + 16), _mm_shuffle_epi8(vref, vrefshuf1)); + _mm_store_si128((__m128i*)(dst + 32), _mm_shuffle_epi8(vref, vrefshuf2)); + _mm_store_si128((__m128i*)(dst + 48), _mm_shuffle_epi8(vref, vrefshuf3)); + dst += 64; + } +} + +static void angular_pred_non_fractional_angle_pxl_copy_w4_wide_angle_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int) +{ + // const int width = 4; + + const __m128i vtranspose = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + //__m128i vidx = _mm_setr_epi32(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); + __m128i vidx = _mm_load_si128((__m128i*)delta_int); + vidx = _mm_cvtepi16_epi32(vidx); + + // Handle as 4x4 blocks. There is no case where height < 4. 
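+  // Each 32-bit gather lane x pulls four consecutive reference samples starting at
+  // ref[y + 1 + delta_int[x]], i.e. one column of the 4x4 output tile. The byte shuffle
+  // below then transposes the lanes into row-major order before the aligned store.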
+ for (int y = 0; y < height; y += 4) { + // Offset indices by one since index 0 is top left. + + __m128i vref = _mm_i32gather_epi32((const int*)(ref + y + 1), vidx, 1); + + vref = _mm_shuffle_epi8(vref, vtranspose); + + _mm_store_si128((__m128i*)dst, vref); + dst += 16; + } +} + +static void angular_pred_non_fractional_angle_pxl_copy_w8_wide_angle_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int) +{ + // const int width = 8; + + // Place the next 4 16-bit delta int values in the lower half of the register. + const __m128i vidxshuf = _mm_setr_epi8( + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff // Don't care. + ); + + // 1st step of the transpose + const __m256i vtranspose0 = _mm256_setr_epi8( + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f, + 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, + 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f + ); + + // 3rd step of the transpose, after permute4x64_epi64 + const __m256i vtranspose1 = _mm256_setr_epi8( + 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, + 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f, + 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, + 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f + ); + + const __m128i vidx = _mm_loadu_si128((__m128i*)delta_int); + const __m256i vidx0 = _mm256_cvtepi16_epi64(vidx); + const __m256i vidx1 = _mm256_cvtepi16_epi64(_mm_shuffle_epi8(vidx, vidxshuf)); + + // Handle as 8x8 blocks. + for (int y = 0; y < height; y += 8) { + __m256i vref0 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx0, 1); + __m256i vref1 = _mm256_i64gather_epi64((const long long*)&ref[y + 1], vidx1, 1); + + // Transpose the 8x8 block + vref0 = _mm256_shuffle_epi8(vref0, vtranspose0); + vref1 = _mm256_shuffle_epi8(vref1, vtranspose0); + + vref0 = _mm256_permute4x64_epi64(vref0, _MM_SHUFFLE(3, 1, 2, 0)); + vref1 = _mm256_permute4x64_epi64(vref1, _MM_SHUFFLE(3, 1, 2, 0)); + + vref0 = _mm256_shuffle_epi8(vref0, vtranspose1); + vref1 = _mm256_shuffle_epi8(vref1, vtranspose1); + + __m256i vlo32 = _mm256_unpacklo_epi32(vref0, vref1); + __m256i vhi32 = _mm256_unpackhi_epi32(vref0, vref1); + + __m256i vfinal0 = _mm256_permute2x128_si256(vlo32, vhi32, 0x20); + __m256i vfinal1 = _mm256_permute2x128_si256(vlo32, vhi32, 0x31); + + _mm256_store_si256((__m256i*)(dst + 0), vfinal0); + _mm256_store_si256((__m256i*)(dst + 32), vfinal1); + + dst += 64; + } +} + +static void angular_pred_non_fractional_angle_pxl_copy_w16_wide_angle_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int) +{ + // const int width = 16; + + // Handle as 16x16 blocks. This function can handle widths from 16 onwards. + for (int y = 0; y < height; y += 16) { + // Offset indices by one since ref[0] is top left. 
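+    // Each of the 16 loads below fetches one output column: bytes 0..15 of load n are
+    // rows y..y+15 of the reference run selected by delta_int[n]. The unpack ladders that
+    // follow (8-bit -> 16-bit -> 32-bit -> 64-bit) transpose those columns into the 16
+    // row vectors stored at the end.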
+ __m128i vref0 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x00])); + __m128i vref1 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x01])); + __m128i vref2 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x02])); + __m128i vref3 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x03])); + __m128i vref4 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x04])); + __m128i vref5 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x05])); + __m128i vref6 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x06])); + __m128i vref7 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x07])); + + __m128i vref8 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x08])); + __m128i vref9 = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x09])); + __m128i vrefa = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0a])); + __m128i vrefb = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0b])); + __m128i vrefc = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0c])); + __m128i vrefd = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0d])); + __m128i vrefe = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0e])); + __m128i vreff = _mm_loadu_si128((__m128i*)(ref + y + 1 + delta_int[0x0f])); + + // The result is just a transpose of the 16x16 block. + __m128i vlo8_0 = _mm_unpacklo_epi8(vref0, vref1); + __m128i vlo8_1 = _mm_unpacklo_epi8(vref2, vref3); + __m128i vlo8_2 = _mm_unpacklo_epi8(vref4, vref5); + __m128i vlo8_3 = _mm_unpacklo_epi8(vref6, vref7); + __m128i vlo8_4 = _mm_unpacklo_epi8(vref8, vref9); + __m128i vlo8_5 = _mm_unpacklo_epi8(vrefa, vrefb); + __m128i vlo8_6 = _mm_unpacklo_epi8(vrefc, vrefd); + __m128i vlo8_7 = _mm_unpacklo_epi8(vrefe, vreff); + + __m128i vhi8_0 = _mm_unpackhi_epi8(vref0, vref1); + __m128i vhi8_1 = _mm_unpackhi_epi8(vref2, vref3); + __m128i vhi8_2 = _mm_unpackhi_epi8(vref4, vref5); + __m128i vhi8_3 = _mm_unpackhi_epi8(vref6, vref7); + __m128i vhi8_4 = _mm_unpackhi_epi8(vref8, vref9); + __m128i vhi8_5 = _mm_unpackhi_epi8(vrefa, vrefb); + __m128i vhi8_6 = _mm_unpackhi_epi8(vrefc, vrefd); + __m128i vhi8_7 = _mm_unpackhi_epi8(vrefe, vreff); + + __m128i vlo16_0 = _mm_unpacklo_epi16(vlo8_0, vlo8_1); + __m128i vlo16_1 = _mm_unpacklo_epi16(vlo8_2, vlo8_3); + __m128i vlo16_2 = _mm_unpacklo_epi16(vhi8_0, vhi8_1); + __m128i vlo16_3 = _mm_unpacklo_epi16(vhi8_2, vhi8_3); + __m128i vlo16_4 = _mm_unpacklo_epi16(vlo8_4, vlo8_5); + __m128i vlo16_5 = _mm_unpacklo_epi16(vlo8_6, vlo8_7); + __m128i vlo16_6 = _mm_unpacklo_epi16(vhi8_4, vhi8_5); + __m128i vlo16_7 = _mm_unpacklo_epi16(vhi8_6, vhi8_7); + + + __m128i vhi16_0 = _mm_unpackhi_epi16(vlo8_0, vlo8_1); + __m128i vhi16_1 = _mm_unpackhi_epi16(vlo8_2, vlo8_3); + __m128i vhi16_2 = _mm_unpackhi_epi16(vhi8_0, vhi8_1); + __m128i vhi16_3 = _mm_unpackhi_epi16(vhi8_2, vhi8_3); + __m128i vhi16_4 = _mm_unpackhi_epi16(vlo8_4, vlo8_5); + __m128i vhi16_5 = _mm_unpackhi_epi16(vlo8_6, vlo8_7); + __m128i vhi16_6 = _mm_unpackhi_epi16(vhi8_4, vhi8_5); + __m128i vhi16_7 = _mm_unpackhi_epi16(vhi8_6, vhi8_7); + + __m128i vlo32_0 = _mm_unpacklo_epi32(vlo16_0, vlo16_1); + __m128i vlo32_1 = _mm_unpacklo_epi32(vlo16_2, vlo16_3); + __m128i vlo32_2 = _mm_unpacklo_epi32(vhi16_0, vhi16_1); + __m128i vlo32_3 = _mm_unpacklo_epi32(vhi16_2, vhi16_3); + __m128i vlo32_4 = _mm_unpacklo_epi32(vlo16_4, vlo16_5); + __m128i vlo32_5 = _mm_unpacklo_epi32(vlo16_6, vlo16_7); + __m128i vlo32_6 = _mm_unpacklo_epi32(vhi16_4, vhi16_5); + __m128i vlo32_7 = _mm_unpacklo_epi32(vhi16_6, vhi16_7); + + __m128i vhi32_0 = 
_mm_unpackhi_epi32(vlo16_0, vlo16_1); + __m128i vhi32_1 = _mm_unpackhi_epi32(vlo16_2, vlo16_3); + __m128i vhi32_2 = _mm_unpackhi_epi32(vhi16_0, vhi16_1); + __m128i vhi32_3 = _mm_unpackhi_epi32(vhi16_2, vhi16_3); + __m128i vhi32_4 = _mm_unpackhi_epi32(vlo16_4, vlo16_5); + __m128i vhi32_5 = _mm_unpackhi_epi32(vlo16_6, vlo16_7); + __m128i vhi32_6 = _mm_unpackhi_epi32(vhi16_4, vhi16_5); + __m128i vhi32_7 = _mm_unpackhi_epi32(vhi16_6, vhi16_7); + + __m128i vrow0 = _mm_unpacklo_epi64(vlo32_0, vlo32_4); + __m128i vrow1 = _mm_unpackhi_epi64(vlo32_0, vlo32_4); + __m128i vrow2 = _mm_unpacklo_epi64(vhi32_0, vhi32_4); + __m128i vrow3 = _mm_unpackhi_epi64(vhi32_0, vhi32_4); + __m128i vrow4 = _mm_unpacklo_epi64(vlo32_2, vlo32_6); + __m128i vrow5 = _mm_unpackhi_epi64(vlo32_2, vlo32_6); + __m128i vrow6 = _mm_unpacklo_epi64(vhi32_2, vhi32_6); + __m128i vrow7 = _mm_unpackhi_epi64(vhi32_2, vhi32_6); + + __m128i vrow8 = _mm_unpacklo_epi64(vlo32_1, vlo32_5); + __m128i vrwo9 = _mm_unpackhi_epi64(vlo32_1, vlo32_5); + __m128i vrowa = _mm_unpacklo_epi64(vhi32_1, vhi32_5); + __m128i vrowb = _mm_unpackhi_epi64(vhi32_1, vhi32_5); + __m128i vrowc = _mm_unpacklo_epi64(vlo32_3, vlo32_7); + __m128i vrowd = _mm_unpackhi_epi64(vlo32_3, vlo32_7); + __m128i vrowe = _mm_unpacklo_epi64(vhi32_3, vhi32_7); + __m128i vrowf = _mm_unpackhi_epi64(vhi32_3, vhi32_7); + + _mm_store_si128((__m128i*)(dst + 0), vrow0); + _mm_store_si128((__m128i*)(dst + 16), vrow1); + _mm_store_si128((__m128i*)(dst + 32), vrow2); + _mm_store_si128((__m128i*)(dst + 48), vrow3); + _mm_store_si128((__m128i*)(dst + 64), vrow4); + _mm_store_si128((__m128i*)(dst + 80), vrow5); + _mm_store_si128((__m128i*)(dst + 96), vrow6); + _mm_store_si128((__m128i*)(dst + 112), vrow7); + + _mm_store_si128((__m128i*)(dst + 128), vrow8); + _mm_store_si128((__m128i*)(dst + 144), vrwo9); + _mm_store_si128((__m128i*)(dst + 160), vrowa); + _mm_store_si128((__m128i*)(dst + 176), vrowb); + _mm_store_si128((__m128i*)(dst + 192), vrowc); + _mm_store_si128((__m128i*)(dst + 208), vrowd); + _mm_store_si128((__m128i*)(dst + 224), vrowe); + _mm_store_si128((__m128i*)(dst + 240), vrowf); + + dst += 256; + } +} + +static void angular_pred_non_fractional_angle_pxl_copy_w32_wide_angle_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int) +{ + // const int width = 32; + // Handle as 32x32 blocks. Similarly to the w16 version, this is also just a transpose of the 32x32 block. + // TODO: if this is too slow, consider doing it in 16x16 blocks. There will be a lot of moving data between registers in this solution. + for (int y = 0; y < height; y += 32) { + // Offset indices by one since ref[0] is top left. 
+ __m256i vref00 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x00])); + __m256i vref01 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x01])); + __m256i vref02 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x02])); + __m256i vref03 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x03])); + __m256i vref04 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x04])); + __m256i vref05 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x05])); + __m256i vref06 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x06])); + __m256i vref07 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x07])); + + __m256i vref08 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x08])); + __m256i vref09 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x09])); + __m256i vref0a = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0a])); + __m256i vref0b = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0b])); + __m256i vref0c = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0c])); + __m256i vref0d = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0d])); + __m256i vref0e = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0e])); + __m256i vref0f = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x0f])); + + __m256i vref10 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x10])); + __m256i vref11 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x11])); + __m256i vref12 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x12])); + __m256i vref13 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x13])); + __m256i vref14 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x14])); + __m256i vref15 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x15])); + __m256i vref16 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x16])); + __m256i vref17 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x17])); + + __m256i vref18 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x18])); + __m256i vref19 = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x19])); + __m256i vref1a = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1a])); + __m256i vref1b = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1b])); + __m256i vref1c = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1c])); + __m256i vref1d = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1d])); + __m256i vref1e = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1e])); + __m256i vref1f = _mm256_loadu_si256((__m256i*)(ref + y + 1 + delta_int[0x1f])); + + __m256i vlo8_0 = _mm256_unpacklo_epi8(vref00, vref01); + __m256i vlo8_1 = _mm256_unpacklo_epi8(vref02, vref03); + __m256i vlo8_2 = _mm256_unpacklo_epi8(vref04, vref05); + __m256i vlo8_3 = _mm256_unpacklo_epi8(vref06, vref07); + __m256i vlo8_4 = _mm256_unpacklo_epi8(vref08, vref09); + __m256i vlo8_5 = _mm256_unpacklo_epi8(vref0a, vref0b); + __m256i vlo8_6 = _mm256_unpacklo_epi8(vref0c, vref0d); + __m256i vlo8_7 = _mm256_unpacklo_epi8(vref0e, vref0f); + __m256i vlo8_8 = _mm256_unpacklo_epi8(vref10, vref11); + __m256i vlo8_9 = _mm256_unpacklo_epi8(vref12, vref13); + __m256i vlo8_a = _mm256_unpacklo_epi8(vref14, vref15); + __m256i vlo8_b = _mm256_unpacklo_epi8(vref16, vref17); + __m256i vlo8_c = _mm256_unpacklo_epi8(vref18, vref19); + __m256i vlo8_d = _mm256_unpacklo_epi8(vref1a, vref1b); + __m256i vlo8_e = _mm256_unpacklo_epi8(vref1c, vref1d); + __m256i vlo8_f = _mm256_unpacklo_epi8(vref1e, vref1f); + + 
__m256i vhi8_0 = _mm256_unpackhi_epi8(vref00, vref01); + __m256i vhi8_1 = _mm256_unpackhi_epi8(vref02, vref03); + __m256i vhi8_2 = _mm256_unpackhi_epi8(vref04, vref05); + __m256i vhi8_3 = _mm256_unpackhi_epi8(vref06, vref07); + __m256i vhi8_4 = _mm256_unpackhi_epi8(vref08, vref09); + __m256i vhi8_5 = _mm256_unpackhi_epi8(vref0a, vref0b); + __m256i vhi8_6 = _mm256_unpackhi_epi8(vref0c, vref0d); + __m256i vhi8_7 = _mm256_unpackhi_epi8(vref0e, vref0f); + __m256i vhi8_8 = _mm256_unpackhi_epi8(vref10, vref11); + __m256i vhi8_9 = _mm256_unpackhi_epi8(vref12, vref13); + __m256i vhi8_a = _mm256_unpackhi_epi8(vref14, vref15); + __m256i vhi8_b = _mm256_unpackhi_epi8(vref16, vref17); + __m256i vhi8_c = _mm256_unpackhi_epi8(vref18, vref19); + __m256i vhi8_d = _mm256_unpackhi_epi8(vref1a, vref1b); + __m256i vhi8_e = _mm256_unpackhi_epi8(vref1c, vref1d); + __m256i vhi8_f = _mm256_unpackhi_epi8(vref1e, vref1f); + + __m256i vlo16_0 = _mm256_unpacklo_epi16(vlo8_0, vlo8_1); + __m256i vlo16_1 = _mm256_unpacklo_epi16(vlo8_2, vlo8_3); + __m256i vlo16_2 = _mm256_unpacklo_epi16(vlo8_4, vlo8_5); + __m256i vlo16_3 = _mm256_unpacklo_epi16(vlo8_6, vlo8_7); + __m256i vlo16_4 = _mm256_unpacklo_epi16(vlo8_8, vlo8_9); + __m256i vlo16_5 = _mm256_unpacklo_epi16(vlo8_a, vlo8_b); + __m256i vlo16_6 = _mm256_unpacklo_epi16(vlo8_c, vlo8_d); + __m256i vlo16_7 = _mm256_unpacklo_epi16(vlo8_e, vlo8_f); + __m256i vlo16_8 = _mm256_unpacklo_epi16(vhi8_0, vhi8_1); + __m256i vlo16_9 = _mm256_unpacklo_epi16(vhi8_2, vhi8_3); + __m256i vlo16_a = _mm256_unpacklo_epi16(vhi8_4, vhi8_5); + __m256i vlo16_b = _mm256_unpacklo_epi16(vhi8_6, vhi8_7); + __m256i vlo16_c = _mm256_unpacklo_epi16(vhi8_8, vhi8_9); + __m256i vlo16_d = _mm256_unpacklo_epi16(vhi8_a, vhi8_b); + __m256i vlo16_e = _mm256_unpacklo_epi16(vhi8_c, vhi8_d); + __m256i vlo16_f = _mm256_unpacklo_epi16(vhi8_e, vhi8_f); + + __m256i vhi16_0 = _mm256_unpackhi_epi16(vlo8_0, vlo8_1); + __m256i vhi16_1 = _mm256_unpackhi_epi16(vlo8_2, vlo8_3); + __m256i vhi16_2 = _mm256_unpackhi_epi16(vlo8_4, vlo8_5); + __m256i vhi16_3 = _mm256_unpackhi_epi16(vlo8_6, vlo8_7); + __m256i vhi16_4 = _mm256_unpackhi_epi16(vlo8_8, vlo8_9); + __m256i vhi16_5 = _mm256_unpackhi_epi16(vlo8_a, vlo8_b); + __m256i vhi16_6 = _mm256_unpackhi_epi16(vlo8_c, vlo8_d); + __m256i vhi16_7 = _mm256_unpackhi_epi16(vlo8_e, vlo8_f); + __m256i vhi16_8 = _mm256_unpackhi_epi16(vhi8_0, vhi8_1); + __m256i vhi16_9 = _mm256_unpackhi_epi16(vhi8_2, vhi8_3); + __m256i vhi16_a = _mm256_unpackhi_epi16(vhi8_4, vhi8_5); + __m256i vhi16_b = _mm256_unpackhi_epi16(vhi8_6, vhi8_7); + __m256i vhi16_c = _mm256_unpackhi_epi16(vhi8_8, vhi8_9); + __m256i vhi16_d = _mm256_unpackhi_epi16(vhi8_a, vhi8_b); + __m256i vhi16_e = _mm256_unpackhi_epi16(vhi8_c, vhi8_d); + __m256i vhi16_f = _mm256_unpackhi_epi16(vhi8_e, vhi8_f); + + __m256i vlo32_0 = _mm256_unpacklo_epi32(vlo16_0, vlo16_1); + __m256i vlo32_1 = _mm256_unpacklo_epi32(vlo16_2, vlo16_3); + __m256i vlo32_2 = _mm256_unpacklo_epi32(vlo16_4, vlo16_5); + __m256i vlo32_3 = _mm256_unpacklo_epi32(vlo16_6, vlo16_7); + __m256i vlo32_4 = _mm256_unpacklo_epi32(vhi16_0, vhi16_1); + __m256i vlo32_5 = _mm256_unpacklo_epi32(vhi16_2, vhi16_3); + __m256i vlo32_6 = _mm256_unpacklo_epi32(vhi16_4, vhi16_5); + __m256i vlo32_7 = _mm256_unpacklo_epi32(vhi16_6, vhi16_7); + __m256i vlo32_8 = _mm256_unpacklo_epi32(vlo16_8, vlo16_9); + __m256i vlo32_9 = _mm256_unpacklo_epi32(vlo16_a, vlo16_b); + __m256i vlo32_a = _mm256_unpacklo_epi32(vlo16_c, vlo16_d); + __m256i vlo32_b = _mm256_unpacklo_epi32(vlo16_e, vlo16_f); + __m256i vlo32_c 
= _mm256_unpacklo_epi32(vhi16_8, vhi16_9); + __m256i vlo32_d = _mm256_unpacklo_epi32(vhi16_a, vhi16_b); + __m256i vlo32_e = _mm256_unpacklo_epi32(vhi16_c, vhi16_d); + __m256i vlo32_f = _mm256_unpacklo_epi32(vhi16_e, vhi16_f); + + __m256i vhi32_0 = _mm256_unpackhi_epi32(vlo16_0, vlo16_1); + __m256i vhi32_1 = _mm256_unpackhi_epi32(vlo16_2, vlo16_3); + __m256i vhi32_2 = _mm256_unpackhi_epi32(vlo16_4, vlo16_5); + __m256i vhi32_3 = _mm256_unpackhi_epi32(vlo16_6, vlo16_7); + __m256i vhi32_4 = _mm256_unpackhi_epi32(vhi16_0, vhi16_1); + __m256i vhi32_5 = _mm256_unpackhi_epi32(vhi16_2, vhi16_3); + __m256i vhi32_6 = _mm256_unpackhi_epi32(vhi16_4, vhi16_5); + __m256i vhi32_7 = _mm256_unpackhi_epi32(vhi16_6, vhi16_7); + __m256i vhi32_8 = _mm256_unpackhi_epi32(vlo16_8, vlo16_9); + __m256i vhi32_9 = _mm256_unpackhi_epi32(vlo16_a, vlo16_b); + __m256i vhi32_a = _mm256_unpackhi_epi32(vlo16_c, vlo16_d); + __m256i vhi32_b = _mm256_unpackhi_epi32(vlo16_e, vlo16_f); + __m256i vhi32_c = _mm256_unpackhi_epi32(vhi16_8, vhi16_9); + __m256i vhi32_d = _mm256_unpackhi_epi32(vhi16_a, vhi16_b); + __m256i vhi32_e = _mm256_unpackhi_epi32(vhi16_c, vhi16_d); + __m256i vhi32_f = _mm256_unpackhi_epi32(vhi16_e, vhi16_f); + + __m256i vlo64_0 = _mm256_unpacklo_epi64(vlo32_0, vlo32_1); + __m256i vlo64_1 = _mm256_unpacklo_epi64(vlo32_2, vlo32_3); + __m256i vlo64_2 = _mm256_unpacklo_epi64(vhi32_0, vhi32_1); + __m256i vlo64_3 = _mm256_unpacklo_epi64(vhi32_2, vhi32_3); + __m256i vlo64_4 = _mm256_unpacklo_epi64(vlo32_4, vlo32_5); + __m256i vlo64_5 = _mm256_unpacklo_epi64(vlo32_6, vlo32_7); + __m256i vlo64_6 = _mm256_unpacklo_epi64(vhi32_4, vhi32_5); + __m256i vlo64_7 = _mm256_unpacklo_epi64(vhi32_6, vhi32_7); + __m256i vlo64_8 = _mm256_unpacklo_epi64(vlo32_8, vlo32_9); + __m256i vlo64_9 = _mm256_unpacklo_epi64(vlo32_a, vlo32_b); + __m256i vlo64_a = _mm256_unpacklo_epi64(vhi32_8, vhi32_9); + __m256i vlo64_b = _mm256_unpacklo_epi64(vhi32_a, vhi32_b); + __m256i vlo64_c = _mm256_unpacklo_epi64(vlo32_c, vlo32_d); + __m256i vlo64_d = _mm256_unpacklo_epi64(vlo32_e, vlo32_f); + __m256i vlo64_e = _mm256_unpacklo_epi64(vhi32_c, vhi32_d); + __m256i vlo64_f = _mm256_unpacklo_epi64(vhi32_e, vhi32_f); + + __m256i vhi64_0 = _mm256_unpackhi_epi64(vlo32_0, vlo32_1); + __m256i vhi64_1 = _mm256_unpackhi_epi64(vlo32_2, vlo32_3); + __m256i vhi64_2 = _mm256_unpackhi_epi64(vhi32_0, vhi32_1); + __m256i vhi64_3 = _mm256_unpackhi_epi64(vhi32_2, vhi32_3); + __m256i vhi64_4 = _mm256_unpackhi_epi64(vlo32_4, vlo32_5); + __m256i vhi64_5 = _mm256_unpackhi_epi64(vlo32_6, vlo32_7); + __m256i vhi64_6 = _mm256_unpackhi_epi64(vhi32_4, vhi32_5); + __m256i vhi64_7 = _mm256_unpackhi_epi64(vhi32_6, vhi32_7); + __m256i vhi64_8 = _mm256_unpackhi_epi64(vlo32_8, vlo32_9); + __m256i vhi64_9 = _mm256_unpackhi_epi64(vlo32_a, vlo32_b); + __m256i vhi64_a = _mm256_unpackhi_epi64(vhi32_8, vhi32_9); + __m256i vhi64_b = _mm256_unpackhi_epi64(vhi32_a, vhi32_b); + __m256i vhi64_c = _mm256_unpackhi_epi64(vlo32_c, vlo32_d); + __m256i vhi64_d = _mm256_unpackhi_epi64(vlo32_e, vlo32_f); + __m256i vhi64_e = _mm256_unpackhi_epi64(vhi32_c, vhi32_d); + __m256i vhi64_f = _mm256_unpackhi_epi64(vhi32_e, vhi32_f); + + __m256i vrow00 = _mm256_permute2x128_si256(vlo64_0, vlo64_1, 0x20); + __m256i vrow01 = _mm256_permute2x128_si256(vhi64_0, vhi64_1, 0x20); + __m256i vrow02 = _mm256_permute2x128_si256(vlo64_2, vlo64_3, 0x20); + __m256i vrow03 = _mm256_permute2x128_si256(vhi64_2, vhi64_3, 0x20); + __m256i vrow04 = _mm256_permute2x128_si256(vlo64_4, vlo64_5, 0x20); + __m256i vrow05 = 
_mm256_permute2x128_si256(vhi64_4, vhi64_5, 0x20); + __m256i vrow06 = _mm256_permute2x128_si256(vlo64_6, vlo64_7, 0x20); + __m256i vrow07 = _mm256_permute2x128_si256(vhi64_6, vhi64_7, 0x20); + + __m256i vrow08 = _mm256_permute2x128_si256(vlo64_8, vlo64_9, 0x20); + __m256i vrow09 = _mm256_permute2x128_si256(vhi64_8, vhi64_9, 0x20); + __m256i vrow0a = _mm256_permute2x128_si256(vlo64_a, vlo64_b, 0x20); + __m256i vrow0b = _mm256_permute2x128_si256(vhi64_a, vhi64_b, 0x20); + __m256i vrow0c = _mm256_permute2x128_si256(vlo64_c, vlo64_d, 0x20); + __m256i vrow0d = _mm256_permute2x128_si256(vhi64_c, vhi64_d, 0x20); + __m256i vrow0e = _mm256_permute2x128_si256(vlo64_e, vlo64_f, 0x20); + __m256i vrow0f = _mm256_permute2x128_si256(vhi64_e, vhi64_f, 0x20); + + __m256i vrow10 = _mm256_permute2x128_si256(vlo64_0, vlo64_1, 0x31); + __m256i vrow11 = _mm256_permute2x128_si256(vhi64_0, vhi64_1, 0x31); + __m256i vrow12 = _mm256_permute2x128_si256(vlo64_2, vlo64_3, 0x31); + __m256i vrow13 = _mm256_permute2x128_si256(vhi64_2, vhi64_3, 0x31); + __m256i vrow14 = _mm256_permute2x128_si256(vlo64_4, vlo64_5, 0x31); + __m256i vrow15 = _mm256_permute2x128_si256(vhi64_4, vhi64_5, 0x31); + __m256i vrow16 = _mm256_permute2x128_si256(vlo64_6, vlo64_7, 0x31); + __m256i vrow17 = _mm256_permute2x128_si256(vhi64_6, vhi64_7, 0x31); + + __m256i vrow18 = _mm256_permute2x128_si256(vlo64_8, vlo64_9, 0x31); + __m256i vrow19 = _mm256_permute2x128_si256(vhi64_8, vhi64_9, 0x31); + __m256i vrow1a = _mm256_permute2x128_si256(vlo64_a, vlo64_b, 0x31); + __m256i vrow1b = _mm256_permute2x128_si256(vhi64_a, vhi64_b, 0x31); + __m256i vrow1c = _mm256_permute2x128_si256(vlo64_c, vlo64_d, 0x31); + __m256i vrow1d = _mm256_permute2x128_si256(vhi64_c, vhi64_d, 0x31); + __m256i vrow1e = _mm256_permute2x128_si256(vlo64_e, vlo64_f, 0x31); + __m256i vrow1f = _mm256_permute2x128_si256(vhi64_e, vhi64_f, 0x31); + + _mm256_store_si256((__m256i*)(dst + 0), vrow00); + _mm256_store_si256((__m256i*)(dst + 32), vrow01); + _mm256_store_si256((__m256i*)(dst + 64), vrow02); + _mm256_store_si256((__m256i*)(dst + 96), vrow03); + _mm256_store_si256((__m256i*)(dst + 128), vrow04); + _mm256_store_si256((__m256i*)(dst + 160), vrow05); + _mm256_store_si256((__m256i*)(dst + 192), vrow06); + _mm256_store_si256((__m256i*)(dst + 224), vrow07); + _mm256_store_si256((__m256i*)(dst + 256), vrow08); + _mm256_store_si256((__m256i*)(dst + 288), vrow09); + _mm256_store_si256((__m256i*)(dst + 320), vrow0a); + _mm256_store_si256((__m256i*)(dst + 352), vrow0b); + _mm256_store_si256((__m256i*)(dst + 384), vrow0c); + _mm256_store_si256((__m256i*)(dst + 416), vrow0d); + _mm256_store_si256((__m256i*)(dst + 448), vrow0e); + _mm256_store_si256((__m256i*)(dst + 480), vrow0f); + _mm256_store_si256((__m256i*)(dst + 512), vrow10); + _mm256_store_si256((__m256i*)(dst + 544), vrow11); + _mm256_store_si256((__m256i*)(dst + 576), vrow12); + _mm256_store_si256((__m256i*)(dst + 608), vrow13); + _mm256_store_si256((__m256i*)(dst + 640), vrow14); + _mm256_store_si256((__m256i*)(dst + 672), vrow15); + _mm256_store_si256((__m256i*)(dst + 704), vrow16); + _mm256_store_si256((__m256i*)(dst + 736), vrow17); + _mm256_store_si256((__m256i*)(dst + 768), vrow18); + _mm256_store_si256((__m256i*)(dst + 800), vrow19); + _mm256_store_si256((__m256i*)(dst + 832), vrow1a); + _mm256_store_si256((__m256i*)(dst + 864), vrow1b); + _mm256_store_si256((__m256i*)(dst + 896), vrow1c); + _mm256_store_si256((__m256i*)(dst + 928), vrow1d); + _mm256_store_si256((__m256i*)(dst + 960), vrow1e); + _mm256_store_si256((__m256i*)(dst + 
992), vrow1f); + + dst += 1024; + } +} + static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { @@ -2935,7 +3441,51 @@ static void uvg_angular_pred_avx2( angular_pred_non_fractional_angle_pxl_copy_ver_avx2(dst, ref_main, width, height, delta_int); } else { - angular_pred_non_fractional_angle_pxl_copy_hor_avx2(dst, ref_main, width, height, delta_int); + if (pred_mode == 2) { + switch (width) { + // Note: these functions do not need the delta int table as the mode is known + case 4: angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(dst, ref_main, height); break; + case 8: angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(dst, ref_main, height); break; + // Cases 16 onward can be solved with a simple memcpy + case 16: + for (int y = 0; y < height; ++y) { + // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1. + memcpy(&dst[y * 16], &ref_main[2 + y], 16 * sizeof(uvg_pixel)); + } + break; + case 32: + for (int y = 0; y < height; ++y) { + memcpy(&dst[y * 32], &ref_main[2 + y], 32 * sizeof(uvg_pixel)); + } + break; + case 64: + for (int y = 0; y < height; ++y) { + memcpy(&dst[y * 64], &ref_main[2 + y], 64 * sizeof(uvg_pixel)); + } + break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; + } + + } + else { + // Wide angle modes -12, -10, -8 and -4 + switch (width) { + case 4: angular_pred_non_fractional_angle_pxl_copy_w4_wide_angle_hor_avx2(dst, ref_main, height, delta_int); break; + case 8: angular_pred_non_fractional_angle_pxl_copy_w8_wide_angle_hor_avx2(dst, ref_main, height, delta_int); break; + case 16: angular_pred_non_fractional_angle_pxl_copy_w16_wide_angle_hor_avx2(dst, ref_main, height, delta_int); break; + case 32: angular_pred_non_fractional_angle_pxl_copy_w32_wide_angle_hor_avx2(dst, ref_main, height, delta_int); break; + // Width 64 never goes into this branch. Leave an assert here to catch future problems. + case 64: + //angular_pred_non_fractional_angle_pxl_copy_hor_avx2(dst, ref_main, width, height, delta_int); break; + assert(false && "Intra angular predicion: Non fractional angle pixel copy with width 64. This should never happen.\n"); + break; + default: + assert(false && "Intra angular predicion: illegal width.\n"); + break; + } + } } } } From 40a682516ce06cd240801d71f1f529fe22a62dd6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 3 Jun 2024 13:20:14 +0300 Subject: [PATCH 178/237] Fix error caused by multi ref line. --- src/strategies/avx2/intra-avx2.c | 24 +-- src/strategies/avx2/intra_avx2_tables.h | 213 ++++++++++++------------ 2 files changed, 122 insertions(+), 115 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 72a49b29..2ea0f4d7 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1330,7 +1330,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, } } -static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height) +static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int multi_ref_offset) { // const int width = 4; @@ -1347,7 +1347,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pix // Handle as 4x4 blocks. There is no case where height < 4. 
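+  // For reference, with mode 2 the integer offset for column x is simply x + 1, so this
+  // function effectively computes dst[y * 4 + x] = ref[y + x + 2 + multi_ref_offset],
+  // where the multi_ref_offset term accounts for the multi reference line (MRL) starting offset.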
   if (height == 4) {
     // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1.
-    __m128i vref = _mm_loadu_si128((__m128i*)&ref[2]);
+    __m128i vref = _mm_loadu_si128((__m128i*)&ref[2 + multi_ref_offset]);
     vref = _mm_shuffle_epi8(vref, vrefshuf0);
     _mm_store_si128((__m128i*)dst, vref);
@@ -1356,7 +1356,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pix
     // Can handle 8 rows at once
     for (int y = 0; y < height; y += 8) {
-      __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y));
+      __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y + multi_ref_offset));
       __m128i vres0 = _mm_shuffle_epi8(vref, vrefshuf0);
       __m128i vres1 = _mm_shuffle_epi8(vref, vrefshuf1);
@@ -1368,7 +1368,7 @@
   }
 }
-static void angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height)
+static void angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int multi_ref_offset)
 {
   const __m128i vrefshuf0 = _mm_setr_epi8(
@@ -1394,7 +1394,7 @@
   // Can handle 8 rows at once. There is no case for height 2 and 4, this function is not reached in those cases.
   for (int y = 0; y < height; y += 8) {
     // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1.
-    __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y));
+    __m128i vref = _mm_loadu_si128((__m128i*)(ref + 2 + y + multi_ref_offset));
     _mm_store_si128((__m128i*)(dst + 0), _mm_shuffle_epi8(vref, vrefshuf0));
     _mm_store_si128((__m128i*)(dst + 16), _mm_shuffle_epi8(vref, vrefshuf1));
     _mm_store_si128((__m128i*)(dst + 32), _mm_shuffle_epi8(vref, vrefshuf2));
@@ -3357,7 +3357,7 @@ static void uvg_angular_pred_avx2(
     // The mode is not horizontal or vertical, we have to do interpolation.
     // Set delta table pointers
-    const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 13) * 64 : (81 - pred_mode) * 64) : (pred_mode <= 34 ? (pred_mode - 2) * 64 : (66 - pred_mode) * 64);
+    const int table_offset = wide_angle_mode ? (pred_mode < 2 ? (pred_mode + 13) * DELTA_TABLE_ROW_LENGTH : (81 - pred_mode) * DELTA_TABLE_ROW_LENGTH) : (pred_mode <= 34 ? (pred_mode - 2) * DELTA_TABLE_ROW_LENGTH : (66 - pred_mode) * DELTA_TABLE_ROW_LENGTH);
     const int16_t* delta_int = wide_angle_mode ? &delta_int_wide_angle_table[table_offset] : &delta_int_table[table_offset];
     delta_int += multi_ref_index; // TODO: These are not necessarily large enough for 64 dimension blocks
     const int16_t* delta_fract = wide_angle_mode ?
&delta_fract_wide_angle_table[table_offset] : &delta_fract_table[table_offset]; @@ -3444,23 +3444,23 @@ static void uvg_angular_pred_avx2( if (pred_mode == 2) { switch (width) { // Note: these functions do not need the delta int table as the mode is known - case 4: angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(dst, ref_main, height); break; - case 8: angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(dst, ref_main, height); break; + case 4: angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(dst, ref_main, height, multi_ref_index); break; + case 8: angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(dst, ref_main, height, multi_ref_index); break; // Cases 16 onward can be solved with a simple memcpy case 16: for (int y = 0; y < height; ++y) { // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1. - memcpy(&dst[y * 16], &ref_main[2 + y], 16 * sizeof(uvg_pixel)); + memcpy(&dst[y * 16], &ref_main[2 + y + multi_ref_index], 16 * sizeof(uvg_pixel)); } break; case 32: for (int y = 0; y < height; ++y) { - memcpy(&dst[y * 32], &ref_main[2 + y], 32 * sizeof(uvg_pixel)); + memcpy(&dst[y * 32], &ref_main[2 + y + multi_ref_index], 32 * sizeof(uvg_pixel)); } break; case 64: for (int y = 0; y < height; ++y) { - memcpy(&dst[y * 64], &ref_main[2 + y], 64 * sizeof(uvg_pixel)); + memcpy(&dst[y * 64], &ref_main[2 + y + multi_ref_index], 64 * sizeof(uvg_pixel)); } break; default: @@ -3564,7 +3564,7 @@ static void uvg_angular_pred_avx2( } - bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH && multi_ref_index == 0); if (pred_mode > 1 && pred_mode < 67) { // Disable PDPC filter if both references are used or if MRL is used if (mode_disp < 0 || multi_ref_index) { diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 40da82c9..bd498bf9 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3,7 +3,10 @@ #include "global.h" - +// Used for calculating table offsets for each prediction mode. +// If row length changes, this value must be updated. +// Remember to use values divisible by 16 to ensure tables stay aligned to 32 bytes. +#define DELTA_TABLE_ROW_LENGTH 80 // The number of unique 128-bit coefficient vectors for a given prediction mode. Applicable for width 4 chroma linear interpolation. static ALIGNED(32) const int8_t coeff_vector128_num_by_mode[33] = { @@ -2177,117 +2180,121 @@ ALIGNED(32) static const int8_t planar_avx2_ver_w8ys[2080] = { // Delta int and delta fract tables. Rows are prediction mode, columns y offset. 
(or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_table[2112] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, // 2 Diagonal mode - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, - 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, - 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, // 6 - 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, - 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, - 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, // 10 - 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, - 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, // 14 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, - 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal and vertical mode - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, - -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, // 22 - -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, - -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, - -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, - -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, // 26 - -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, - -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, - -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, - -1, -2, -2, -3, -4, -4, -5, -5, 
-6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, // 30 - -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, - -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, // 34 Diagonal mode +// Table extended to allow dimensions up to 80. This will ensure that the SIMD code will not read outside the table and the table is aligned to 32 bytes. +// This is done to prevent errors when multi ref line is enabled. In cases where MRL > 0, the table rows can be indexed with a value larger than 64. +// If the max dimensions change in the future, the table can be generated again with the new dimensions. The generation python script can be found in the speed bench repository. +// This is also true for the other delta tables. 
+ALIGNED(32) static const int16_t delta_int_table[2640] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, // 2 Diagonal mode + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, + 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, + 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 12, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 39, 40, 40, 41, 42, 43, 43, 44, 45, 46, 46, 47, 48, 48, 49, 50, 51, 51, 52, 53, 53, 54, 55, 56, 56, 57, + 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 35, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 41, 42, 43, 43, 44, 45, 45, 46, 46, 47, 48, 48, 49, 50, // 6 + 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 41, 41, 42, 42, 43, 43, 44, 45, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, + 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 33, 33, 34, 34, 35, + 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 30, // 10 + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 
8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, // 14 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal or vertical mode + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, + -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -10, -10, -10, // 22 + -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -11, -11, -11, -11, -11, -12, -12, -12, -12, -12, -12, -13, -13, -13, -13, -13, -14, -14, -14, -14, -14, -15, -15, -15, -15, -15, -15, + -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -16, -17, -17, -17, -17, -18, -18, -18, -18, -19, -19, -19, -19, -20, -20, -20, -20, + -1, -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -4, -5, -5, -5, -5, -6, -6, -6, -7, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -10, -10, -11, -11, 
-11, -12, -12, -12, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -19, -20, -20, -20, -20, -21, -21, -21, -22, -22, -22, -23, -23, -23, -24, -24, -24, -25, -25, -25, -25, + -1, -1, -2, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -6, -7, -7, -8, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -12, -13, -13, -14, -14, -14, -15, -15, -15, -16, -16, -17, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -21, -22, -22, -23, -23, -23, -24, -24, -24, -25, -25, -26, -26, -26, -27, -27, -27, -28, -28, -29, -29, -29, -30, -30, -30, // 26 + -1, -1, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -14, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -21, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -25, -26, -26, -27, -27, -28, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -32, -33, -33, -34, -34, -35, -35, -35, + -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -33, -33, -34, -34, -35, -35, -36, -36, -37, -37, -38, -38, -39, -39, -40, -40, + -1, -2, -2, -3, -3, -4, -4, -5, -6, -6, -7, -7, -8, -8, -9, -9, -10, -11, -11, -12, -12, -13, -13, -14, -15, -15, -16, -16, -17, -17, -18, -18, -19, -20, -20, -21, -21, -22, -22, -23, -24, -24, -25, -25, -26, -26, -27, -27, -28, -29, -29, -30, -30, -31, -31, -32, -33, -33, -34, -34, -35, -35, -36, -36, -37, -38, -38, -39, -39, -40, -40, -41, -42, -42, -43, -43, -44, -44, -45, -45, + -1, -2, -2, -3, -4, -4, -5, -5, -6, -7, -7, -8, -9, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -15, -16, -17, -17, -18, -19, -19, -20, -20, -21, -22, -22, -23, -24, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -30, -31, -32, -32, -33, -34, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -40, -41, -42, -42, -43, -44, -44, -45, -45, -46, -47, -47, -48, -49, -49, -50, -50, // 30 + -1, -2, -3, -3, -4, -5, -6, -6, -7, -8, -8, -9, -10, -11, -11, -12, -13, -13, -14, -15, -16, -16, -17, -18, -18, -19, -20, -21, -21, -22, -23, -23, -24, -25, -26, -26, -27, -28, -29, -29, -30, -31, -31, -32, -33, -34, -34, -35, -36, -36, -37, -38, -39, -39, -40, -41, -41, -42, -43, -44, -44, -45, -46, -46, -47, -48, -49, -49, -50, -51, -52, -52, -53, -54, -54, -55, -56, -57, -57, -58, + -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52, -53, -54, -55, -56, -57, -57, -58, -59, -60, -61, -61, -62, -63, -64, -65, -65, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -58, -59, -60, -61, -62, -63, -64, -65, -66, -67, -68, -68, -69, -70, -71, -72, -73, + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, 
-46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, -65, -66, -67, -68, -69, -70, -71, -72, -73, -74, -75, -76, -77, -78, -79, -80, // 34 Diagonal mode }; -// TODO: cut this table to 32 width, the second 32 width half is identical to the first -ALIGNED(32) static const int16_t delta_fract_table[2112] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode -29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, -26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, -23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, -20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 -18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, -16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, -14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, -12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 -10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, - 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 
7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Horizontal & vertical mode -31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, -29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, -28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 -26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, -24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, -20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 -18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, -16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, -14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, -12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 
28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode +// OPTIONAL TODO: This table can be cut to 32 width, the second 32 width half (and all repeating 32 item chunks) is identical to the first. For easy access, leave the table as is, otherwise some modulo operations are necessary. +ALIGNED(32) static const int16_t delta_fract_table[2640] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 Diagonal mode +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 6 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 
2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 10 +10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, 10, 20, 30, 8, 18, 28, 6, 16, 26, 4, 14, 24, 2, 12, 22, 0, + 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28, 0, // 14 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18 Pure horizontal or vertical mode +31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, +30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, +29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 31, 28, 
25, 22, 19, 16, +28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, 28, 24, 20, 16, 12, 8, 4, 0, // 22 +26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, +24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, 24, 16, 8, 0, +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, +20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, 20, 8, 28, 16, 4, 24, 12, 0, // 26 +18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, 18, 4, 22, 8, 26, 12, 30, 16, 2, 20, 6, 24, 10, 28, 14, 0, +16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, +14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, 14, 28, 10, 24, 6, 20, 2, 16, 30, 12, 26, 8, 22, 4, 18, 0, +12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, 12, 24, 4, 16, 28, 8, 20, 0, // 30 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 34 Diagonal mode }; // Delta int and delta fract wide angle tables. Rows are corrected prediction mode, columns y offset. (or x offset for horizontal modes) -ALIGNED(32) static const int16_t delta_int_wide_angle_table[960] = { - 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, // 81 - 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, // -12 - 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, - 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, // -10 - 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, - 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, // -8 - 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, - 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, // -6 - 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, // -4 - 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 
24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, - 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, // -2 - 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, - 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, // 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, // 1 +ALIGNED(32) static const int16_t delta_int_wide_angle_table[1200] = { + 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1024, 1056, 1088, 1120, 1152, 1184, 1216, 1248, 1280, 1312, 1344, 1376, 1408, 1440, 1472, 1504, 1536, 1568, 1600, 1632, 1664, 1696, 1728, 1760, 1792, 1824, 1856, 1888, 1920, 1952, 1984, 2016, 2048, 2080, 2112, 2144, 2176, 2208, 2240, 2272, 2304, 2336, 2368, 2400, 2432, 2464, 2496, 2528, 2560, // -13 Non-fractional angle + 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, // -12 Non-fractional angle + 10, 21, 31, 42, 53, 63, 74, 85, 95, 106, 117, 127, 138, 149, 159, 170, 181, 191, 202, 213, 223, 234, 245, 255, 266, 277, 287, 298, 309, 319, 330, 341, 351, 362, 372, 383, 394, 404, 415, 426, 436, 447, 458, 468, 479, 490, 500, 511, 522, 532, 543, 554, 564, 575, 586, 596, 607, 618, 628, 639, 650, 660, 671, 682, 692, 703, 713, 724, 735, 745, 756, 767, 777, 788, 799, 809, 820, 831, 841, 852, // -11 + 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512, 520, 528, 536, 544, 552, 560, 568, 576, 584, 592, 600, 608, 616, 624, 632, 640, // -10 Non-fractional angle + 5, 10, 16, 21, 26, 32, 37, 42, 48, 53, 58, 64, 69, 74, 80, 85, 90, 96, 101, 106, 112, 117, 122, 128, 133, 138, 144, 149, 154, 160, 165, 171, 176, 181, 187, 192, 197, 203, 208, 213, 219, 224, 229, 235, 240, 245, 251, 256, 261, 267, 272, 277, 283, 288, 293, 299, 304, 309, 315, 320, 325, 331, 336, 342, 347, 352, 358, 363, 368, 374, 379, 384, 390, 395, 400, 406, 411, 416, 422, 427, // -9 + 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 
124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, // -8 Non-fractional angle + 3, 6, 9, 12, 15, 19, 22, 25, 28, 31, 35, 38, 41, 44, 47, 51, 54, 57, 60, 63, 66, 70, 73, 76, 79, 82, 86, 89, 92, 95, 98, 102, 105, 108, 111, 114, 117, 121, 124, 127, 130, 133, 137, 140, 143, 146, 149, 153, 156, 159, 162, 165, 168, 172, 175, 178, 181, 184, 188, 191, 194, 197, 200, 204, 207, 210, 213, 216, 219, 223, 226, 229, 232, 235, 239, 242, 245, 248, 251, 255, // -7 + 2, 5, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 34, 37, 40, 43, 45, 48, 51, 53, 56, 59, 61, 64, 67, 69, 72, 75, 77, 80, 83, 86, 88, 91, 94, 96, 99, 102, 104, 107, 110, 112, 115, 118, 120, 123, 126, 129, 131, 134, 137, 139, 142, 145, 147, 150, 153, 155, 158, 161, 163, 166, 169, 172, 174, 177, 180, 182, 185, 188, 190, 193, 196, 198, 201, 204, 206, 209, 212, 215, // -6 + 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 25, 27, 29, 31, 34, 36, 38, 41, 43, 45, 47, 50, 52, 54, 57, 59, 61, 63, 66, 68, 70, 73, 75, 77, 79, 82, 84, 86, 88, 91, 93, 95, 98, 100, 102, 104, 107, 109, 111, 114, 116, 118, 120, 123, 125, 127, 130, 132, 134, 136, 139, 141, 143, 146, 148, 150, 152, 155, 157, 159, 161, 164, 166, 168, 171, 173, 175, 177, 180, 182, // -5 + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, // -4 Non-fractional angle + 1, 3, 5, 7, 8, 10, 12, 14, 16, 17, 19, 21, 23, 24, 26, 28, 30, 32, 33, 35, 37, 39, 40, 42, 44, 46, 48, 49, 51, 53, 55, 57, 58, 60, 62, 64, 65, 67, 69, 71, 73, 74, 76, 78, 80, 81, 83, 85, 87, 89, 90, 92, 94, 96, 97, 99, 101, 103, 105, 106, 108, 110, 112, 114, 115, 117, 119, 121, 122, 124, 126, 128, 130, 131, 133, 135, 137, 138, 140, 142, // -3 + 1, 3, 4, 6, 7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 23, 25, 27, 28, 30, 31, 33, 35, 36, 38, 39, 41, 43, 44, 46, 47, 49, 51, 52, 54, 55, 57, 58, 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 79, 81, 82, 84, 86, 87, 89, 90, 92, 94, 95, 97, 98, 100, 102, 103, 105, 106, 108, 109, 111, 113, 114, 116, 117, 119, 121, 122, 124, 125, 127, // -2 + 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30, 32, 33, 35, 36, 37, 39, 40, 42, 43, 45, 46, 47, 49, 50, 52, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 68, 70, 71, 73, 74, 75, 77, 78, 80, 81, 82, 84, 85, 87, 88, 90, 91, 92, 94, 95, 97, 98, 99, 101, 102, 104, 105, 106, 108, 109, 111, 112, // -1 + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 90, 91, 92, 93, 95, 96, 97, // 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, // 1 }; -// TODO: Can be cut in half due to horizontal symmetry -ALIGNED(32) static const int16_t delta_fract_wide_angle_table[960] = { - 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 81 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 -21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 -11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -8 - 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, -22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // -6 - 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -4 -25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, -19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, // -2 -13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, - 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, // 0 - 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, // 1 +// OPTIONAL TODO: This table can be cut to 32 width, the second 32 width half (and all repeating 32 item chunks) is identical to the first. 
For easy access, leave the table as is, otherwise some modulo operations are necessary. +ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1200] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -13 Non-fractional angle + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12 Non-fractional angle +21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, // -11 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -10 Non-fractional angle +11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, // -9 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -8 Non-fractional angle + 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, // -7 +22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10, 0, // -6 + 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, // -5 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -4 Non-fractional angle +25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, 9, 2, 27, 20, 13, 6, 31, 24, 17, 10, 3, 28, 21, 14, 7, 0, 25, 18, 11, 4, 29, 22, 15, 8, 1, 26, 19, 12, 5, 30, 23, 16, // -3 +19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 
26, 13, 0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, // -2 +13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, // -1 + 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, 26, 1, 8, 15, 22, 29, 4, 11, 18, 25, 0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, 13, 20, 27, 2, 9, 16, // 0 + 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, // 1 }; From 2ee4c39d60029ce647650c4da735432d09133338 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 4 Jun 2024 10:12:32 +0300 Subject: [PATCH 179/237] [avx2] fix bunch of alignment issues --- src/intra.c | 2 +- src/strategies/avx2/dct-avx2.c | 4 +-- src/strategies/avx2/depquant-avx2.c | 2 +- src/strategies/avx2/intra-avx2.c | 42 ++++++++++++++--------------- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/intra.c b/src/intra.c index 3fa00a26..39785747 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1498,7 +1498,7 @@ static void intra_recon_tb_leaf( uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); - uvg_pixel pred[32 * 32]; + ALIGNED(32) uvg_pixel pred[32 * 32]; uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 01b1b4bb..45bc7891 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -3201,7 +3201,7 @@ static void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_ ver_coeff = ff_dct8_4x32_coeff_ver; } - int16_t v_hor_pass_out[4*32]; + ALIGNED(32) int16_t v_hor_pass_out[4*32]; fast_forward_tr_4xN_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); @@ -5637,7 +5637,7 @@ static void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type ver_coeff = ff_dct8_16x32_coeff_ver; } - int16_t v_hor_pass_out[32*16]; + ALIGNED(32) int16_t v_hor_pass_out[32*16]; fast_forward_DCT2_B16_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index b393bce6..697ef8dd 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -647,7 +647,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, for (int i = 0; i < numSbb * 4; i += 32) { __m256i sbb_flags = _mm256_loadu_si256((__m256i*)(&cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i])); sbb_flags = _mm256_shuffle_epi8(sbb_flags, inc_ref_state); - _mm256_store_si256((__m256i*)&sbbFlags[i], sbb_flags); + _mm256_storeu_si256((__m256i*)&sbbFlags[i], sbb_flags); } } // The first 16 variables will be loaded from the previous state so this can be started from 16 diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 
2ea0f4d7..44ee3c96 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1413,7 +1413,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w4_wide_angle_hor_avx2(uv ); //__m128i vidx = _mm_setr_epi32(delta_int[0], delta_int[1], delta_int[2], delta_int[3]); - __m128i vidx = _mm_load_si128((__m128i*)delta_int); + __m128i vidx = _mm_loadu_si128((__m128i*)delta_int); vidx = _mm_cvtepi16_epi32(vidx); // Handle as 4x4 blocks. There is no case where height < 4. @@ -1931,7 +1931,7 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vseq, 8); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vseq, 8); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2068,7 +2068,7 @@ static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } } - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2311,7 +2311,7 @@ static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re //__m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); //vleft = _mm_shuffle_epi8(vleft, vshuf); - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); //__m256i vleft16 = _mm256_cvtepu8_epi16(vleft); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2371,7 +2371,7 @@ static void angular_pdpc_ver_8x2_scale2_high_angle_avx2(uvg_pixel* dst, const uv __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); vleft = _mm_shuffle_epi8(vleft, vshuf); - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); //__m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2427,7 +2427,7 @@ static void angular_pdpc_ver_8x2_scale1_high_angle_avx2(uvg_pixel* dst, const uv } } - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft = _mm256_loadu_si256((__m256i*)left); @@ -2673,7 +2673,7 @@ static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, memcpy(&tmp[0], &ref_side[shifted_inv_angle_sum[y + 0] + 1], 8 * sizeof(uvg_pixel)); memcpy(&tmp[8], &ref_side[shifted_inv_angle_sum[y + 1] + 1], 8 * sizeof(uvg_pixel)); - __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); __m128i vtop = _mm_load_si128((__m128i*)tmp); __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); @@ -2855,7 +2855,7 @@ static void angular_pdpc_mode18_w8_avx2(uvg_pixel* dst, const uvg_pixel top_left for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { const __m256i vwT = _mm256_load_si256((const __m256i*) 
&intra_pdpc_w8_hor_weight[o]); - __m128i vpred = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); __m256i accu = _mm256_sub_epi16(vref16, vtopleft); @@ -3210,7 +3210,7 @@ static void angular_pdpc_mode50_scale1_avx2(uvg_pixel* dst, const uvg_pixel top_ __m256i vref16 = _mm256_cvtepu8_epi16(vref); //__m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); - __m128i vdst = _mm_i64gather_epi64((const int64_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i accu = _mm256_sub_epi16(vref16, vtopleft); @@ -5583,7 +5583,7 @@ static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg __m256i vbefore = _mm256_load_si256((__m256i*)before); - __m256i vbehind = _mm256_load_si256((__m256i*)src_ptr); + __m256i vbehind = _mm256_loadu_si256((__m256i*)src_ptr); // Permute the input values to get the result in correct order. vbefore = _mm256_permutevar8x32_epi32(vbefore, permute_mask); @@ -5962,8 +5962,8 @@ static void mip_upsampling_w4_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixe vres0 = _mm256_permute4x64_epi64(vlo128, _MM_SHUFFLE(3, 1, 2, 0)); vres1 = _mm256_permute4x64_epi64(vhi128, _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_store_si256((__m256i*)(dst + 0), vres0); - _mm256_store_si256((__m256i*)(dst + 32), vres1); + _mm256_storeu_si256((__m256i*)(dst + 0), vres0); + _mm256_storeu_si256((__m256i*)(dst + 32), vres1); } @@ -6092,7 +6092,7 @@ static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_ { int64_t refline = *(int64_t*)ref; __m128i vbehind0 = _mm_load_si128((__m128i*)(src + 0)); - __m128i vbefore1 = _mm_load_si128((__m128i*)(src + 8)); + __m128i vbefore1 = _mm_loadu_si128((__m128i*)(src + 8)); __m128i vbehind1 = _mm_load_si128((__m128i*)(src + 16)); __m128i vbefore0 = vbehind0; @@ -6112,9 +6112,9 @@ static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_ _mm_store_si128((__m128i*)(dst + 32), vres2); _mm_store_si128((__m128i*)(dst + 48), vres3); - vbefore0 = _mm_load_si128((__m128i*)(src + 24)); + vbefore0 = _mm_loadu_si128((__m128i*)(src + 24)); vbehind0 = _mm_load_si128((__m128i*)(src + 32)); - vbefore1 = _mm_load_si128((__m128i*)(src + 40)); + vbefore1 = _mm_loadu_si128((__m128i*)(src + 40)); vbehind1 = _mm_load_si128((__m128i*)(src + 48)); vavg0 = _mm_avg_epu8(vbefore0, vbehind0); @@ -6360,7 +6360,7 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix __m256i vbefore256; __m256i vbehind256; - __m128i vbefore = _mm_load_si128((__m128i*)ref); + __m128i vbefore = _mm_loadu_si128((__m128i*)ref); vbefore256 = _mm256_cvtepu8_epi16(vbefore); for (int i = 0; i < 8; ++i) { @@ -6388,8 +6388,8 @@ static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pix vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_store_si256((__m256i*)(dst + (i * 64) + 0), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 64) + 32), vres1); + _mm256_storeu_si256((__m256i*)(dst + (i * 64) + 0), vres0); + _mm256_storeu_si256((__m256i*)(dst + (i * 64) + 32), vres1); vbefore256 = vbehind256; } @@ -6587,13 +6587,13 @@ static void mip_upsampling_w16_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg static void 
mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { - __m256i vbefore = _mm256_load_si256((__m256i*)ref); + __m256i vbefore = _mm256_loadu_si256((__m256i*)ref); for (int i = 0; i < 8; ++i) { - __m256i vbehind = _mm256_load_si256((__m256i*)(src + (i * 64))); + __m256i vbehind = _mm256_loadu_si256((__m256i*)(src + (i * 64))); __m256i vavg = _mm256_avg_epu8(vbefore, vbehind); - _mm256_store_si256((__m256i*)(dst + (i * 64)), vavg); + _mm256_storeu_si256((__m256i*)(dst + (i * 64)), vavg); vbefore = vbehind; } From 184c11b956621dc337df8cd89abe50d8ae1111db Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 7 Jun 2024 22:12:59 +0300 Subject: [PATCH 180/237] Fix couple of issues --- src/strategies/avx2/dct-avx2.c | 2 +- src/strategies/avx2/intra-avx2.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 45bc7891..2fb47b45 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -5949,7 +5949,7 @@ static void fast_forward_DCT2_B32_avx2_hor(const int16_t* src, __m256i* dst, con v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); - if(line == 32) { + if(line == 32 || line == 1) { v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); } diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 44ee3c96..7a8ca86a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1347,7 +1347,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pix // Handle as 4x4 blocks. There is no case where height < 4. if (height == 4) { // Offset indices by one since index 0 is top left and plus one since delta_int[0] for mode 2 is 1. - __m128i vref = _mm_loadu_si128((__m128i*)&ref[2] + multi_ref_offset); + __m128i vref = _mm_loadu_si128((__m128i*)(&ref[2] + multi_ref_offset)); vref = _mm_shuffle_epi8(vref, vrefshuf0); _mm_store_si128((__m128i*)dst, vref); @@ -3639,7 +3639,7 @@ static void uvg_angular_pred_avx2( angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); } else { - if (mode_disp < 10) + if (mode_disp < 10 || 1) angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); else angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); From e71bd9864afdbbb1fd23e2681e73307198e3f803 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 10 Jun 2024 12:01:39 +0300 Subject: [PATCH 181/237] Fix angular_pdpc_ver_8x2_scale2 function. Was using wrong hard coded scale value. --- src/strategies/avx2/intra-avx2.c | 4 ++-- src/strategies/generic/dct-generic.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 7a8ca86a..6a61ba31 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2343,7 +2343,7 @@ static void angular_pdpc_ver_8x2_scale2_high_angle_avx2(uvg_pixel* dst, const uv // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. // This function handles cases where prediction angle is high. 
For PDPC, this means the needed reference samples are close together, enabling more effective loading. - const int scale = 1; + const int scale = 2; const int log2_width = uvg_g_convert_to_log2[width]; const int limit = 6; @@ -3639,7 +3639,7 @@ static void uvg_angular_pred_avx2( angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); } else { - if (mode_disp < 10 || 1) + if (mode_disp < 10) angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); else angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index ccddf17a..3b9f1d43 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -2608,6 +2608,7 @@ static void mts_dct_generic( if (height == 1) { dct_hor(input, output, shift_1st, height, 0, skip_width); } else if (width == 1) { + // The shift value is taken from VTM, it's a special case for width == 1 dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height); } else { dct_hor(input, tmp, shift_1st, height, 0, skip_width); From f2d5c3abf055b4c98b5139b85f40edf2b4e521bc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 11 Jun 2024 13:25:42 +0300 Subject: [PATCH 182/237] [dep_quant] Fix bug --- src/dep_quant.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 16591390..6d6f5ad5 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -579,7 +579,7 @@ static INLINE void update_common_context( sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state]; } for (int i = 16; i < setCpSize; ++i) { - levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state]; + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels[scan_pos * 4 + i * 4 + prev_sbb_state]; } } else { From 809c5c3ff6a59167c4dd2e83bc78a3bba7b5baaa Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 23 Jul 2024 12:27:57 +0300 Subject: [PATCH 183/237] [dep_quant] Fix reading uninitialized coefficients --- src/dep_quant.c | 16 +++--- src/dep_quant.h | 1 - src/encode_coding_tree.c | 4 +- src/rdo.c | 8 +-- src/search_intra.c | 4 +- src/strategies/avx2/quant-avx2.c | 2 +- .../generic/encode_coding_tree-generic.c | 4 +- src/strategies/generic/quant-generic.c | 2 +- src/tables.c | 53 ++++++++++++++++++- src/tables.h | 2 +- src/transform.c | 2 +- 11 files changed, 74 insertions(+), 24 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 6d6f5ad5..8df790c4 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -80,8 +80,8 @@ int uvg_init_nb_info(encoder_control_t * encoder) { const int scanType = SCAN_DIAG; const uint32_t blkWidthIdx = hd; const uint32_t blkHeightIdx = vd; - const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx); - const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, 0, hd, vd); + const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx, 0); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, 0, hd, vd, 0); NbInfoSbb** sId2NbSbb = &encoder->m_scanId2NbInfoSbbArray[hd][vd]; NbInfoOut** sId2NbOut = &encoder->m_scanId2NbInfoOutArray[hd][vd]; // consider only non-zero-out 
region @@ -329,7 +329,6 @@ static void reset_common_context(common_context* ctx, const rate_estimator_t * r } ctx->m_curr_sbb_ctx_offset = 0; ctx->m_prev_sbb_ctx_offset = 1; - ctx->num_coeff = num_coeff; } static void init_rate_esimator(rate_estimator_t * rate_estimator, const cabac_data_t * const ctx, color_t color) @@ -862,7 +861,6 @@ int uvg_dep_quant( cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; - const int numCoeff = width * height; memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); *absSum = 0; @@ -872,8 +870,9 @@ int uvg_dep_quant( const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); - const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height, 0); + const uint32_t* const scan2 = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height, is_mts); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height, 0); int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; @@ -913,6 +912,7 @@ int uvg_dep_quant( effWidth = (width == 32) ? 16 : width; zeroOut = (effHeight < height || effWidth < width); } + const int numCoeff = effWidth * effHeight; zeroOutforThres = zeroOut || (32 < height || 32 < width); //===== find first test position ===== int firstTestPos = numCoeff - 1; @@ -925,7 +925,7 @@ int uvg_dep_quant( srcCoeff, enableScalingLists, &dep_quant_context, - scan, + scan2, q_coeff, &firstTestPos, width, @@ -1080,7 +1080,7 @@ void uvg_dep_quant_dequant( const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, 0, log2_tr_width, log2_tr_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, 0, log2_tr_width, log2_tr_height, 0); bool needs_block_size_trafo_scale =((log2_tr_height + log2_tr_width) % 2 == 1); needs_block_size_trafo_scale |= 0; // Non log2 block size diff --git a/src/dep_quant.h b/src/dep_quant.h index 6ef54f4d..cae2da21 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -124,7 +124,6 @@ typedef struct { int m_prev_sbb_ctx_offset; uint8_t sbb_memory[8 * 1024]; uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH]; - int num_coeff; } common_context; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 858d89f4..615c3d5d 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -239,8 +239,8 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // TODO: log2_cg_size is wrong if width != height const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height, 0); + const uint32_t* const scan_cg = 
uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height, 0); double bits = 0; diff --git a/src/rdo.c b/src/rdo.c index c5d1c71b..7ce8e775 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1215,8 +1215,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height, 0); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height, 0); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1501,8 +1501,8 @@ void uvg_rdoq( const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); - const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height, 0); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height, 0); const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; diff --git a/src/search_intra.c b/src/search_intra.c index a644ed9c..fe21533c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -216,8 +216,8 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); - const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height, 0); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height, 0); coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; uvg_get_sub_coeff(coeff_y, lcu->coeff.y, lcu_px.x, lcu_px.y, width, height, LCU_WIDTH); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index cada96f1..41b36f05 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -382,7 +382,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height, 0); int32_t qp_scaled = 
uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index c3065903..7e10c5ec 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -84,8 +84,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, const uint8_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); - const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height, 0); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height, 0); // Init base contexts according to block type diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index e39b6c52..51b2fe7c 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -63,7 +63,7 @@ void uvg_quant_generic( const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height, 0); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? 
MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; diff --git a/src/tables.c b/src/tables.c index c98ecf79..fb3412d9 100644 --- a/src/tables.c +++ b/src/tables.c @@ -2596,6 +2596,43 @@ static const uint32_t* const g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_ }; +static const uint32_t mts_reduced_16x4[16 * 4] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, + 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, 71, 103, + 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, + 12, 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, +}; +static const uint32_t mts_reduced_16x8[16*8] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, + 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, + 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, 71, 103, + 132, 164, 133, 196, 165, 134, 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, + 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, + 136, 168, 137, 200, 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, 203, 235, + 12, 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, + 140, 172, 141, 204, 173, 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, 239, +}; +static const uint32_t mts_reduced_16x16[16 * 16] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, + 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, + 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, + 102, 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, 259, 353, 322, + 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, 228, 197, 166, 135, 229, + 198, 167, 230, 199, 231, 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, + 105, 74, 43, 106, 75, 107, 384, 416, 385, 448, 417, 386, 480, 449, 418, + 387, 481, 450, 419, 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, 325, + 294, 263, 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, + 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, 45, 14, + 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 388, 420, 389, 452, 421, + 390, 484, 453, 422, 391, 485, 454, 423, 486, 455, 487, 264, 296, 265, 328, + 297, 266, 360, 329, 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, 141, + 204, 173, 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, 239, 392, 424, + 393, 456, 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, + 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, 335, 367, + 396, 428, 397, 460, 429, 398, 492, 461, 430, 399, 493, 462, 431, 494, 463, + 495, +}; + /** * \brief Return array of scan order indices. * @@ -2606,10 +2643,24 @@ static const uint32_t* const g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_ * * \return Returns pointer to scan order table based on given dimensions. 
*/ -const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h, bool mts_reduced) { // TODO: horizontal and vertical scan types assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); + if (mts_reduced && log2_w == 5 && scan_group == SCAN_GROUP_4X4) { + if (log2_h == 2) { + return mts_reduced_16x4; + } + if (log2_h == 3) { + return mts_reduced_16x8; + } + if (log2_h >= 4) { + return mts_reduced_16x16; + } + } + if (mts_reduced && log2_h == 5 && scan_group == SCAN_GROUP_4X4) { + log2_h = 4; + } if (scan_group == SCAN_GROUP_4X4) { return g_scan_order[scan_group][log2_w][log2_h]; diff --git a/src/tables.h b/src/tables.h index 44621251..ade9bfa3 100644 --- a/src/tables.h +++ b/src/tables.h @@ -143,6 +143,6 @@ extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; #define SCAN_GROUP_UNGROUPED 0 #define SCAN_GROUP_4X4 1 -const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h, bool mts_reduced); #endif //TABLES_H_ diff --git a/src/transform.c b/src/transform.c index 98728da0..47445d1a 100644 --- a/src/transform.c +++ b/src/transform.c @@ -188,7 +188,7 @@ void uvg_derive_lfnst_constraints( const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; - const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height, 0); signed scan_pos_last = -1; coeff_t temp[TR_MAX_WIDTH * TR_MAX_WIDTH]; From e38e7bc916f12038895ba86c984a3754769503e4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 23 Jul 2024 14:23:30 +0300 Subject: [PATCH 184/237] [dep_quant] Fix uninitialized read that did not really matter --- src/dep_quant.c | 2 +- src/dep_quant.h | 1 + src/strategies/avx2/depquant-avx2.c | 1 + src/strategies/generic/depquant-generic.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index 8df790c4..973860dd 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -1056,7 +1056,7 @@ int uvg_dep_quant( for (; prev_id >= 0; scanIdx++) { Decision temp = dep_quant_context.m_trellis[scanIdx]; int32_t blkpos = scan[scanIdx]; - coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? -temp.absLevel[prev_id] : temp.absLevel[prev_id]); + coeff_out[blkpos] = temp.zero_out ? 0 : (srcCoeff[blkpos] < 0 ? 
-temp.absLevel[prev_id] : temp.absLevel[prev_id]); *absSum += temp.absLevel[prev_id]; prev_id = temp.prevId[prev_id]; } diff --git a/src/dep_quant.h b/src/dep_quant.h index cae2da21..b3026b96 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -108,6 +108,7 @@ typedef struct { int64_t ALIGNED(32) rdCost[8]; int32_t ALIGNED(32) absLevel[8]; int32_t ALIGNED(32) prevId[8]; + uint8_t zero_out; } Decision; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 697ef8dd..6fec385d 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -1448,6 +1448,7 @@ void uvg_dep_quant_decide_and_update_avx2( } xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + decisions->zero_out = zeroOut; if (scan_pos) { if (!(scan_pos & 15)) { diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c index b15ef52b..3271badb 100644 --- a/src/strategies/generic/depquant-generic.c +++ b/src/strategies/generic/depquant-generic.c @@ -202,6 +202,7 @@ static void uvg_dep_quant_decide_and_update_generic( } xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + decisions->zero_out = zeroOut; if (scan_pos) { if (!(scan_pos & 15)) { From 6e60a931a4a9f3c355b06d1610ea770e1c9523e3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Sat, 29 Jun 2024 15:36:42 +0300 Subject: [PATCH 185/237] Replace gather with loadu. Data was sequential, no need to gather. --- src/strategies/avx2/intra-avx2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6a61ba31..1c6e68a9 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2137,7 +2137,7 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* int limit = MIN(3 << scale, width); - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + //__m128i vseq = _mm_setr_epi32(0, 1, 2, 3); //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width __m256i v32s = _mm256_set1_epi16(32); @@ -2160,7 +2160,8 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum[0] + 1]); vleft = _mm_shuffle_epi8(vleft, vshuf); - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); + //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); + __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); @@ -3296,6 +3297,8 @@ static void uvg_angular_pred_avx2( // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { + // In cases where sample_disp is negative, references are needed from both sides. + // This step combines the main and side reference. 
memcpy(&temp_main[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); memcpy(&temp_side[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); From 0138e665892289b6d90bfebd47795436f8d4aba8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 1 Aug 2024 14:36:59 +0300 Subject: [PATCH 186/237] Rename pdpc functions. High angle functions were actually handling low angles, now named properly. --- src/strategies/avx2/intra-avx2.c | 52 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1c6e68a9..043605bd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1861,7 +1861,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w32_wide_angle_hor_avx2(u } -static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; int16_t left[4][4]; @@ -1906,7 +1906,7 @@ static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } -static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_w8_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 8; @@ -1950,7 +1950,7 @@ static void angular_pdpc_ver_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } -static void angular_pdpc_ver_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); const int scale = 2; // Other functions handle scales 0 and 1 @@ -2087,7 +2087,7 @@ static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } } -static void angular_pdpc_ver_w16_scale2_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_w16_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); const int scale = 2; // Other functions handle scales 0 and 1 @@ -2130,7 +2130,7 @@ static void angular_pdpc_ver_w16_scale2_high_angle_avx2(uvg_pixel* dst, const uv } -static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; int16_t left[4][4]; @@ -2179,7 +2179,7 @@ static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } } -static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. 
If width is 4, use a specialized function instead. // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. @@ -2229,7 +2229,7 @@ static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re } } -static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. @@ -2280,7 +2280,7 @@ static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uv } -static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_8x2_scale1_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. @@ -2339,7 +2339,7 @@ static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } } -static void angular_pdpc_ver_8x2_scale2_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_8x2_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. @@ -2399,7 +2399,7 @@ static void angular_pdpc_ver_8x2_scale2_high_angle_avx2(uvg_pixel* dst, const uv } } -static void angular_pdpc_ver_8x2_scale1_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. @@ -3621,31 +3621,31 @@ static void uvg_angular_pred_avx2( // Note: no need to check for negative mode_disp, since it is already checked before. switch (width) { case 4: - // Low mode disp -> low angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. - // Handle high angles with more tight reference spacing with separate functions with more optimized loads. + // Low mode disp -> high angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. 
+ // Handle low angles with more tight reference spacing with separate functions with more optimized loads. if (mode_disp < 6) - angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); - else angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + else + angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); break; case 8: if (scale == 0) { if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); - else angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); } else if (scale == 1) { if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); - else angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); } else { if (mode_disp < 10) - angular_pdpc_ver_w8_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w8_high_angle_avx2(dst, ref_side, height, scale, mode_disp); else - angular_pdpc_ver_8x2_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x2_scale2_avx2(dst, ref_side, width, height, mode_disp); } break; case 16: // 16 width and higher done with the same functions @@ -3654,21 +3654,21 @@ static void uvg_angular_pred_avx2( switch (scale) { case 0: if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); - else angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); break; case 1: if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); - else angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + else + angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); break; case 2: if (mode_disp < 14) - angular_pdpc_ver_w16_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_w16_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_w16_scale2_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_w16_scale2_avx2(dst, ref_side, width, height, mode_disp); break; default: assert(false && "Intra PDPC: Invalid scale.\n"); From cf9f4037396df0d235e794269396134c7baab816 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 1 Aug 2024 15:22:21 +0300 Subject: [PATCH 187/237] Implement improved PDPC w4. --- src/strategies/avx2/intra-avx2.c | 56 ++++++++++++++++++++++++- src/strategies/avx2/intra_avx2_tables.h | 9 ++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 043605bd..15d4296a 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2779,6 +2779,60 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } +// Improved PDPC functions. 
These use the streamlined PDPC equation + +// Mode 18 + + +// Mode 50 + + +// Other modes + +static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 4; + __m128i v32s = _mm_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w4_ver_improved_weight[offset]); + const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); + __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); + __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + _mm_store_si128((__m128i*)(dst + (y * width)), packed); + } +} + + + + + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) { @@ -3626,7 +3680,7 @@ static void uvg_angular_pred_avx2( if (mode_disp < 6) angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); else - angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); break; case 8: if (scale == 0) { diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index bd498bf9..1acbaa9b 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1334,6 +1334,15 @@ static ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1, 0, 0, 0, 0, // Scale 2 }; +// Weights for improved PDPC + +// Weights for intra pdpc w4 vertical. +ALIGNED(32) const uint8_t intra_pdpc_w4_ver_improved_weight[] = { + 32, 32, 56, 8, 62, 2, 64, 0, 32, 32, 56, 8, 62, 2, 64, 0, // Scale 0 + 32, 32, 48, 16, 56, 8, 60, 4, 32, 32, 48, 16, 56, 8, 60, 4, // Scale 1 + 32, 32, 32, 32, 48, 16, 48, 16, 32, 32, 32, 32, 48, 16, 48, 16, // Scale 2 +}; + // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. From e1b556170cf882b6996bc179f6e7f3f17cb3afae Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 2 Aug 2024 14:23:45 +0300 Subject: [PATCH 188/237] Implement high angle version with improved memory access. 
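
For reference, the "improved" PDPC weights pack each pair (64 - wL, wL) into
bytes so a single maddubs does the whole multiply-add. A minimal scalar sketch
of the blend these functions vectorize (illustrative only, not part of this
patch; the helper name is made up and uvg_pixel is assumed to be 8-bit):

    #include <stdint.h>

    // out = (cur * (64 - wL) + left * wL + 32) >> 6, with wL = 32 >> ((2 * x) >> scale).
    // The left sample of column x on row y is picked through the shifted inverse
    // angle sum, exactly as in the AVX2 code.
    static void pdpc_ver_scalar_sketch(uint8_t *dst, const uint8_t *ref_side,
                                       const int16_t *shifted_inv_angle_sum,
                                       int width, int height, int scale)
    {
      const int limit = 3 << scale; // wL reaches zero at x == limit
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < limit && x < width; ++x) {
          const int wL = 32 >> ((2 * x) >> scale);
          const int left = ref_side[y + shifted_inv_angle_sum[x] + 1];
          const int cur = dst[y * width + x];
          dst[y * width + x] = (uint8_t)((cur * (64 - wL) + left * wL + 32) >> 6);
        }
      }
    }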
--- src/strategies/avx2/intra-avx2.c | 50 +++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 15d4296a..1295aeea 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2789,6 +2789,51 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* // Other modes +static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 4; + ALIGNED(32) uint8_t left[4][4]; + __m128i v32s = _mm_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w4_ver_improved_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + const __m128i vleftshuf = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + for (int xx = 0; xx < width; ++xx) { + memcpy(left[xx], &ref_side[(y + 0) + shifted_inv_angle_sum[xx] + 1], 4 * sizeof(uint8_t)); + } + + __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); + __m128i vleft = _mm_load_si128((__m128i*)left); + vleft = _mm_shuffle_epi8(vleft, vleftshuf); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); + __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + _mm_store_si128((__m128i*)(dst + (y * width)), packed); + } +} + static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2830,9 +2875,6 @@ static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } - - - // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) { @@ -3678,7 +3720,7 @@ static void uvg_angular_pred_avx2( // Low mode disp -> high angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. // Handle low angles with more tight reference spacing with separate functions with more optimized loads. if (mode_disp < 6) - angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w4_high_angle_improved_avx2(dst, ref_side, height, scale, mode_disp); else angular_pdpc_ver_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); break; From c39260218cdf673966398384cf2470900ad00aa4 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 2 Aug 2024 14:41:28 +0300 Subject: [PATCH 189/237] Further improve w4 high angle vertical PDPC. Gather removes the memory bottleneck completely. Huge speed increase. 
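
Rough equivalence behind the change (illustrative sketch only; the helper name
is made up): the four per-column copies and the gather fetch the same 16 bytes,
the gather just does it in one instruction and the shuffle transposes the
column-major bytes into row order.

    #include <immintrin.h>
    #include <stdint.h>

    // Load the 4x4 block of left reference samples for rows y..y+3: one 32-bit
    // lane per column offset, then a byte shuffle to transpose into row order.
    static inline __m128i load_left_4x4(const uint8_t *ref_side, const int16_t *offsets, int y)
    {
      const __m128i vidx = _mm_setr_epi32(offsets[0], offsets[1], offsets[2], offsets[3]);
      const __m128i vtranspose = _mm_setr_epi8(
          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
      __m128i v = _mm_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx, 1);
      return _mm_shuffle_epi8(v, vtranspose);
    }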
--- src/strategies/avx2/intra-avx2.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1295aeea..067d6555 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2792,7 +2792,7 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; - ALIGNED(32) uint8_t left[4][4]; + //ALIGNED(32) uint8_t left[4][4]; __m128i v32s = _mm_set1_epi16(32); // Scale can be 0, 1 or 2 @@ -2804,16 +2804,21 @@ static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const u const __m128i vleftshuf = _mm_setr_epi8( 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, - 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f); + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + __m128i vidx = _mm_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], + shifted_inv_angle_sum[2], shifted_inv_angle_sum[3]); // For a 4 width block, height must be at least 4. Handle 4 lines at once. for (int y = 0; y < height; y += 4) { - for (int xx = 0; xx < width; ++xx) { + /*for (int xx = 0; xx < width; ++xx) { memcpy(left[xx], &ref_side[(y + 0) + shifted_inv_angle_sum[xx] + 1], 4 * sizeof(uint8_t)); - } + }*/ __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); - __m128i vleft = _mm_load_si128((__m128i*)left); + //__m128i vleft = _mm_load_si128((__m128i*)left); + __m128i vleft = _mm_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx, 1); vleft = _mm_shuffle_epi8(vleft, vleftshuf); __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); From f4375d0d7324d65cf339c441454ad4c4c314f9b2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 2 Aug 2024 16:15:50 +0300 Subject: [PATCH 190/237] Improve ver pdpc 4x4 scale0 high angle. --- src/strategies/avx2/intra-avx2.c | 55 +++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 067d6555..9a1b3bf2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2879,6 +2879,59 @@ static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } +static void angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. + // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. 
+ const int scale = 0; + const int log2_width = uvg_g_convert_to_log2[width]; + __m128i v32s = _mm_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const __m128i vweight = _mm_load_si128((const __m128i*) & intra_pdpc_w4_ver_improved_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + + __m128i vidx_left = _mm_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], + shifted_inv_angle_sum[2], shifted_inv_angle_sum[3]); + + const __m128i vleftshuf = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m128i vleft = _mm_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx_left, 1); + vleft = _mm_shuffle_epi8(vleft, vleftshuf); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); + __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(packed, 0); + *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(packed, 1); + *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(packed, 2); + *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(packed, 3); + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -3732,7 +3785,7 @@ static void uvg_angular_pred_avx2( case 8: if (scale == 0) { if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); } From 78e759843cd6c9266b6a8fb1e155377d008a13f2 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 6 Aug 2024 14:23:16 +0300 Subject: [PATCH 191/237] Improve vertical 4x4 scale 0 pdpc. --- src/strategies/avx2/intra-avx2.c | 54 ++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 9a1b3bf2..f2ccee7d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2255,7 +2255,7 @@ static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
for (int y = 0; y < height; y += 4) { - __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[y + shifted_inv_angle_sum[0] + 1]); vleft = _mm_shuffle_epi8(vleft, vshuf); __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); @@ -2932,6 +2932,56 @@ static void angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(uvg_pixel* dst, } } +static void angular_pdpc_ver_4x4_scale0_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. + // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 0; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 3; + + __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m128i v32s = _mm_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w4_ver_improved_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); + + // For a 4 width block, height must be at least 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); + __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(packed, 0); + *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(packed, 1); + *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(packed, 2); + *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(packed, 3); + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. 
static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -3787,7 +3837,7 @@ static void uvg_angular_pred_avx2( if (mode_disp < 6) angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_improved_avx2(dst, ref_side, width, height, mode_disp); } else if (scale == 1) { if (mode_disp < 8) From ada9c9cf5dac7d5cbeefc54a4380951364df4ef1 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 6 Aug 2024 17:38:21 +0300 Subject: [PATCH 192/237] Improve vertical 8x2 scale 1 high angle PDPC. Renamed to 8x4 as it now handles 4 rows at a time. --- src/strategies/avx2/intra-avx2.c | 81 ++++++++++++++++++++++++- src/strategies/avx2/intra_avx2_tables.h | 10 ++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f2ccee7d..095b8c43 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2879,6 +2879,7 @@ static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } + static void angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. @@ -2983,6 +2984,84 @@ static void angular_pdpc_ver_4x4_scale0_improved_avx2(uvg_pixel* dst, const uvg_ } +static void angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. + // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m256i vseq = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vidx = _mm256_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 32; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_improved_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + //const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_8x2_scale1_ver[shuf_offset]); + + __m256i vidxleft = _mm256_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], + shifted_inv_angle_sum[2], shifted_inv_angle_sum[3], + shifted_inv_angle_sum[4], shifted_inv_angle_sum[5], + shifted_inv_angle_sum[6], shifted_inv_angle_sum[7]); // These two are not needed. 
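+  // (With scale 1 the PDPC weight is zero from column 6 onwards, so the lanes gathered
+  //  for columns 6 and 7 are multiplied by a zero weight and leave dst unchanged.)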
+ + const __m256i vtranspose0 = _mm256_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f, + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + const __m256i vtranspose1 = _mm256_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f + ); + + // For width 8, height must be at least 4 as PDPC is not done when height < 4. Handle 4 lines at once, this enables us to use gather for ref pixels. + for (int y = 0; y < height; y += 4) { + __m256i vdst = _mm256_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); + __m256i vleft = _mm256_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidxleft, 1); + + // Transpose vleft + vleft = _mm256_shuffle_epi8(vleft, vtranspose0); + vleft = _mm256_permute4x64_epi64(vleft, _MM_SHUFFLE(3, 1, 2, 0)); + vleft = _mm256_shuffle_epi8(vleft, vtranspose1); + + __m256i vlo = _mm256_unpacklo_epi8(vdst, vleft); + __m256i vhi = _mm256_unpackhi_epi8(vdst, vleft); + + __m256i vmaddlo = _mm256_maddubs_epi16(vlo, vweight); + __m256i vmaddhi = _mm256_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm256_add_epi16(vmaddlo, v32s); + vmaddhi = _mm256_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm256_srai_epi16(vmaddlo, 6); + vmaddhi = _mm256_srai_epi16(vmaddhi, 6); + + __m256i packed = _mm256_packus_epi16(vmaddlo, vmaddhi); + + // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. + // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity + if (width == 8) { + _mm256_store_si256((__m256i*)(dst + (y * width)), packed); + } + else { + *(uint64_t*)(dst + (y + 0) * width) = _mm256_extract_epi64(packed, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm256_extract_epi64(packed, 1); + *(uint64_t*)(dst + (y + 2) * width) = _mm256_extract_epi64(packed, 2); + *(uint64_t*)(dst + (y + 3) * width) = _mm256_extract_epi64(packed, 3); + } + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) { @@ -3841,7 +3920,7 @@ static void uvg_angular_pred_avx2( } else if (scale == 1) { if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); } diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 1acbaa9b..b373cea4 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1334,15 +1334,23 @@ static ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1, 0, 0, 0, 0, // Scale 2 }; + // Weights for improved PDPC -// Weights for intra pdpc w4 vertical. +// Weights for improved intra pdpc w4 vertical. 
ALIGNED(32) const uint8_t intra_pdpc_w4_ver_improved_weight[] = { 32, 32, 56, 8, 62, 2, 64, 0, 32, 32, 56, 8, 62, 2, 64, 0, // Scale 0 32, 32, 48, 16, 56, 8, 60, 4, 32, 32, 48, 16, 56, 8, 60, 4, // Scale 1 32, 32, 32, 32, 48, 16, 48, 16, 32, 32, 32, 32, 48, 16, 48, 16, // Scale 2 }; +// Weights for improved intra pdpc w8 vertical. +ALIGNED(32) const uint8_t intra_pdpc_w8_ver_improved_weight[] = { + 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 + 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, // Scale 1 + 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, // Scale 2 +}; + // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. From 636e1837e21af8cf0621470f7c22f6a2fadda211 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 6 Aug 2024 18:11:31 +0300 Subject: [PATCH 193/237] Improve vertical 8x2 scale 1 PDPC. --- src/strategies/avx2/intra-avx2.c | 59 +++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 095b8c43..dbdd4f2d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3062,6 +3062,63 @@ static void angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(uvg_pixel* dst, } } +static void angular_pdpc_ver_8x2_scale1_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. + // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 1; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m128i v32s = _mm_set1_epi16(32); + + const int offset = scale * 32; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w8_ver_improved_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_8x2_scale1_ver[shuf_offset]); + + // For width 8, height must be at least 2. Handle 2 lines at once. 
+  for (int y = 0; y < height; y += 2) {
+    __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[y + shifted_inv_angle_sum[0] + 1]);
+    vleft = _mm_shuffle_epi8(vleft, vshuf);
+
+    __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1);
+
+    __m128i vlo = _mm_unpacklo_epi8(vdst, vleft);
+    __m128i vhi = _mm_unpackhi_epi8(vdst, vleft);
+
+    __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight);
+    __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight);
+
+    vmaddlo = _mm_add_epi16(vmaddlo, v32s);
+    vmaddhi = _mm_add_epi16(vmaddhi, v32s);
+
+    vmaddlo = _mm_srai_epi16(vmaddlo, 6);
+    vmaddhi = _mm_srai_epi16(vmaddhi, 6);
+
+    __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi);
+
+    // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed.
+    // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity
+    if (width == 8) {
+      _mm_store_si128((__m128i*)(dst + (y * width)), packed);
+    }
+    else {
+      *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(packed, 0);
+      *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(packed, 1);
+    }
+  }
+}
+
+
 // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes.
 static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale)
 {
@@ -3922,7 +3979,7 @@ static void uvg_angular_pred_avx2(
         if (mode_disp < 8)
           angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp);
         else
-          angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp);
+          angular_pdpc_ver_8x2_scale1_improved_avx2(dst, ref_side, width, height, mode_disp);
       }
       else {
         if (mode_disp < 10)

From df26760819112b0e1421dd434c5b72c9b923be64 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Tue, 6 Aug 2024 18:21:34 +0300
Subject: [PATCH 194/237] Further improve vertical 8x2 scale 1. Renamed to 8x4 as it now handles 4 rows at a time and uses 256-bit vectors.

---
 src/strategies/avx2/intra-avx2.c | 51 ++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index dbdd4f2d..edb813c6 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -3062,7 +3062,7 @@ static void angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(uvg_pixel* dst,
 }
 
-static void angular_pdpc_ver_8x2_scale1_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp)
+static void angular_pdpc_ver_8x4_scale1_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp)
 {
   // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1.
   // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled.
@@ -3072,47 +3072,52 @@ static void angular_pdpc_ver_8x2_scale1_improved_avx2(uvg_pixel* dst, const uvg_ const int limit = 6; - __m128i vseq = _mm_set_epi64x(1, 0); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m128i v32s = _mm_set1_epi16(32); + __m256i vseq = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vidx = _mm256_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); const int offset = scale * 32; const int inv_angle_offset = mode_disp * 64; const int shuf_offset = mode_disp * 16; - const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w8_ver_improved_weight[offset]); + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_improved_weight[offset]); const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_8x2_scale1_ver[shuf_offset]); - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); + // For width 8, height must be at least 4 as PDPC is not done when height < 4. Handle 4 lines at once. + for (int y = 0; y < height; y += 4) { + __m128i vleft0 = _mm_loadu_si128((__m128i*) &ref_side[(y + 0) + shifted_inv_angle_sum[0] + 1]); + __m128i vleft1 = _mm_loadu_si128((__m128i*) &ref_side[(y + 2) + shifted_inv_angle_sum[0] + 1]); + vleft0 = _mm_shuffle_epi8(vleft0, vshuf); + vleft1 = _mm_shuffle_epi8(vleft1, vshuf); - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); + __m256i vleft = _mm256_inserti128_si256(_mm256_castsi128_si256(vleft0), vleft1, 1); + __m256i vdst = _mm256_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); - __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); + __m256i vlo = _mm256_unpacklo_epi8(vdst, vleft); + __m256i vhi = _mm256_unpackhi_epi8(vdst, vleft); - __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); - __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + __m256i vmaddlo = _mm256_maddubs_epi16(vlo, vweight); + __m256i vmaddhi = _mm256_maddubs_epi16(vhi, vweight); - vmaddlo = _mm_add_epi16(vmaddlo, v32s); - vmaddhi = _mm_add_epi16(vmaddhi, v32s); + vmaddlo = _mm256_add_epi16(vmaddlo, v32s); + vmaddhi = _mm256_add_epi16(vmaddhi, v32s); - vmaddlo = _mm_srai_epi16(vmaddlo, 6); - vmaddhi = _mm_srai_epi16(vmaddhi, 6); + vmaddlo = _mm256_srai_epi16(vmaddlo, 6); + vmaddhi = _mm256_srai_epi16(vmaddhi, 6); - __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + __m256i packed = _mm256_packus_epi16(vmaddlo, vmaddhi); // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. 
// If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity if (width == 8) { - _mm_store_si128((__m128i*)(dst + (y * width)), packed); + _mm256_store_si256((__m256i*)(dst + (y * width)), packed); } else { - *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(packed, 0); - *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(packed, 1); + *(uint64_t*)(dst + (y + 0) * width) = _mm256_extract_epi64(packed, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm256_extract_epi64(packed, 1); + *(uint64_t*)(dst + (y + 2) * width) = _mm256_extract_epi64(packed, 2); + *(uint64_t*)(dst + (y + 3) * width) = _mm256_extract_epi64(packed, 3); } } } @@ -3979,7 +3984,7 @@ static void uvg_angular_pred_avx2( if (mode_disp < 8) angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_8x2_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); } else { if (mode_disp < 10) From 645fc0ee0b43cf5f1b847ce0b67d5c002f9ac6ab Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 7 Aug 2024 12:37:27 +0300 Subject: [PATCH 195/237] Disable if branch which was never accessed. Width 8 scale 2 cases for PDPC do not exist. --- src/strategies/avx2/intra-avx2.c | 59 +++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index edb813c6..a82b3853 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2787,7 +2787,7 @@ static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* // Mode 50 -// Other modes +// Vertical modes static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { @@ -3123,6 +3123,54 @@ static void angular_pdpc_ver_8x4_scale1_improved_avx2(uvg_pixel* dst, const uvg_ } +static void angular_pdpc_ver_w8_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int mode_disp) +{ + // Only handles cases where scale is 2. + const int width = 8; + const int scale = 2; + + int limit = MIN(3 << scale, width); + + __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 32; + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_improved_weight[offset]); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + // For width 8, height must be at least 2. Handle 2 lines at once. 
+ for (int y = 0; y < height; y += 2) { + ALIGNED(32) int16_t left[16] = { 0 }; + for (int xx = 0; xx < limit; ++xx) { + for (int yy = 0; yy < 2; ++yy) { + left[yy * width + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + } + + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vseq, 8); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } +} + + +// Horizontal modes + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -3980,18 +4028,19 @@ static void uvg_angular_pred_avx2( else angular_pdpc_ver_4x4_scale0_improved_avx2(dst, ref_side, width, height, mode_disp); } - else if (scale == 1) { + else /*if (scale == 1)*/ { if (mode_disp < 8) angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else angular_pdpc_ver_8x4_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); } - else { + // This branch was never executed. There is no case where width == 8 and scale == 2 and PDPC is enabled. + /*else { if (mode_disp < 10) - angular_pdpc_ver_w8_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w8_high_angle_improved_avx2(dst, ref_side, height, mode_disp); else angular_pdpc_ver_8x2_scale2_avx2(dst, ref_side, width, height, mode_disp); - } + }*/ break; case 16: // 16 width and higher done with the same functions case 32: From 683faa6aba72c5762501aad4f15fc3108e0a1999 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 7 Aug 2024 16:05:12 +0300 Subject: [PATCH 196/237] Improve vertical w16 high angle PDPC. --- src/strategies/avx2/intra-avx2.c | 126 +++++++++++++++++------- src/strategies/avx2/intra_avx2_tables.h | 7 ++ 2 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a82b3853..e4de03ae 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3123,48 +3123,100 @@ static void angular_pdpc_ver_8x4_scale1_improved_avx2(uvg_pixel* dst, const uvg_ } -static void angular_pdpc_ver_w8_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int mode_disp) +static void angular_pdpc_ver_w16_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { - // Only handles cases where scale is 2. - const int width = 8; - const int scale = 2; - - int limit = MIN(3 << scale, width); - - __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width __m256i v32s = _mm256_set1_epi16(32); + const int scale = 2; // Other functions handle scales 0 and 1 + int limit = 12; // With scale 2, limit is always 12. 
const int offset = scale * 32; - const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_improved_weight[offset]); + const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w16_ver_improved_weight[offset]); const int inv_angle_offset = mode_disp * 64; const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - ALIGNED(32) int16_t left[16] = { 0 }; - for (int xx = 0; xx < limit; ++xx) { - for (int yy = 0; yy < 2; ++yy) { - left[yy * width + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } - - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vseq, 8); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); + const __m256i vidx0 = _mm256_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], + shifted_inv_angle_sum[2], shifted_inv_angle_sum[3], + shifted_inv_angle_sum[4], shifted_inv_angle_sum[5], + shifted_inv_angle_sum[6], shifted_inv_angle_sum[7]); + const __m256i vidx1 = _mm256_setr_epi32(shifted_inv_angle_sum[8], shifted_inv_angle_sum[9], + shifted_inv_angle_sum[10], shifted_inv_angle_sum[11], + shifted_inv_angle_sum[12], shifted_inv_angle_sum[13], // These are not used. + shifted_inv_angle_sum[14], shifted_inv_angle_sum[15]); // These are not used. - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); + const __m256i transpose = _mm256_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f, + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + // 0xff are 'don't care' values, they will be zeroed out by coefficients + const __m256i vpermute = _mm256_setr_epi32( + 0x00, 0x04, 0x02, 0xff, 0x01, 0x05, 0x03, 0xff + ); - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + // Handle 4 rows at once to enable gather for ref pixels. 
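+  // Each gathered 32-bit lane holds, for one column x, the four consecutive left reference
+  // bytes ref_side[y + shifted_inv_angle_sum[x] + 1 .. + 4], i.e. the samples for rows y..y+3.
+  // The byte shuffles, unpacks and the cross-lane permute below transpose this column-major
+  // data into row order so it lines up with the row-major dst loads before the
+  // interleave + maddubs blend.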
+ for (int y = 0; y < height; y += 4) { + __m128i vdstraw0 = _mm_load_si128((const __m128i*)(dst + ((y + 0) * width))); + __m128i vdstraw1 = _mm_load_si128((const __m128i*)(dst + ((y + 1) * width))); + __m128i vdstraw2 = _mm_load_si128((const __m128i*)(dst + ((y + 2) * width))); + __m128i vdstraw3 = _mm_load_si128((const __m128i*)(dst + ((y + 3) * width))); + + __m256i vdst0 = _mm256_inserti128_si256(_mm256_castsi128_si256(vdstraw0), vdstraw1, 1); + __m256i vdst1 = _mm256_inserti128_si256(_mm256_castsi128_si256(vdstraw2), vdstraw3, 1); + + __m256i vleft0 = _mm256_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx0, 1); + __m256i vleft1 = _mm256_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx1, 1); + vleft0 = _mm256_shuffle_epi8(vleft0, transpose); + vleft1 = _mm256_shuffle_epi8(vleft1, transpose); + + __m256i vtmplo = _mm256_unpacklo_epi64(vleft0, vleft1); + __m256i vtmphi = _mm256_unpackhi_epi64(vleft0, vleft1); + + vleft0 = _mm256_permutevar8x32_epi32(vtmplo, vpermute); + vleft1 = _mm256_permutevar8x32_epi32(vtmphi, vpermute); + + __m256i vlo0 = _mm256_unpacklo_epi8(vdst0, vleft0); + __m256i vhi0 = _mm256_unpackhi_epi8(vdst0, vleft0); + __m256i vlo1 = _mm256_unpacklo_epi8(vdst1, vleft1); + __m256i vhi1 = _mm256_unpackhi_epi8(vdst1, vleft1); + + __m256i v0 = _mm256_permute2x128_si256(vlo0, vhi0, 0x20); + __m256i v1 = _mm256_permute2x128_si256(vlo0, vhi0, 0x31); + __m256i v2 = _mm256_permute2x128_si256(vlo1, vhi1, 0x20); + __m256i v3 = _mm256_permute2x128_si256(vlo1, vhi1, 0x31); + + __m256i vmadd0 = _mm256_maddubs_epi16(v0, vweight); + __m256i vmadd1 = _mm256_maddubs_epi16(v1, vweight); + __m256i vmadd2 = _mm256_maddubs_epi16(v2, vweight); + __m256i vmadd3 = _mm256_maddubs_epi16(v3, vweight); + + vmadd0 = _mm256_add_epi16(vmadd0, v32s); + vmadd1 = _mm256_add_epi16(vmadd1, v32s); + vmadd2 = _mm256_add_epi16(vmadd2, v32s); + vmadd3 = _mm256_add_epi16(vmadd3, v32s); + + vmadd0 = _mm256_srai_epi16(vmadd0, 6); + vmadd1 = _mm256_srai_epi16(vmadd1, 6); + vmadd2 = _mm256_srai_epi16(vmadd2, 6); + vmadd3 = _mm256_srai_epi16(vmadd3, 6); + + __m256i packed0 = _mm256_packus_epi16(vmadd0, vmadd1); + __m256i packed1 = _mm256_packus_epi16(vmadd2, vmadd3); + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + if (width == 16) { + _mm256_store_si256((__m256i*)(dst + ((y + 0) * width)), packed0); + _mm256_store_si256((__m256i*)(dst + ((y + 2) * width)), packed1); + } + else { + _mm_store_si128((__m128i*)(dst + ((y + 0) * width)), _mm256_extracti128_si256(packed0, 0)); + _mm_store_si128((__m128i*)(dst + ((y + 1) * width)), _mm256_extracti128_si256(packed0, 1)); + _mm_store_si128((__m128i*)(dst + ((y + 2) * width)), _mm256_extracti128_si256(packed1, 0)); + _mm_store_si128((__m128i*)(dst + ((y + 3) * width)), _mm256_extracti128_si256(packed1, 1)); + } } } @@ -4037,7 +4089,7 @@ static void uvg_angular_pred_avx2( // This branch was never executed. There is no case where width == 8 and scale == 2 and PDPC is enabled. 
/*else { if (mode_disp < 10) - angular_pdpc_ver_w8_high_angle_improved_avx2(dst, ref_side, height, mode_disp); + angular_pdpc_ver_w8_high_angle_avx2(dst, ref_side, height, mode_disp); else angular_pdpc_ver_8x2_scale2_avx2(dst, ref_side, width, height, mode_disp); }*/ @@ -4048,19 +4100,19 @@ static void uvg_angular_pred_avx2( switch (scale) { case 0: if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_improved_avx2(dst, ref_side, width, height, mode_disp); break; case 1: if (mode_disp < 8) - angular_pdpc_ver_8x2_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_8x2_scale1_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); break; case 2: if (mode_disp < 14) - angular_pdpc_ver_w16_high_angle_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_w16_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); else angular_pdpc_ver_w16_scale2_avx2(dst, ref_side, width, height, mode_disp); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index b373cea4..eb134d89 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1351,6 +1351,13 @@ ALIGNED(32) const uint8_t intra_pdpc_w8_ver_improved_weight[] = { 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, // Scale 2 }; +// Weights for improved intra pdpc w16 vertical. +ALIGNED(32) const uint8_t intra_pdpc_w16_ver_improved_weight[] = { + 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 + 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 1 + 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, 62, 2, 62, 2, 63, 1, 63, 1, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 2 +}; + // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. From 8589026ff842f2ba641e1a9eacaab209d3349697 Mon Sep 17 00:00:00 2001 From: siivonek Date: Wed, 7 Aug 2024 16:46:01 +0300 Subject: [PATCH 197/237] Improve vertical w16 PDPC. --- src/strategies/avx2/intra-avx2.c | 44 +++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e4de03ae..00cbeefd 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3220,6 +3220,48 @@ static void angular_pdpc_ver_w16_high_angle_improved_avx2(uvg_pixel* dst, const } } +static void angular_pdpc_ver_w16_scale2_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + __m128i v32s = _mm_set1_epi16(32); + const int scale = 2; // Other functions handle scales 0 and 1 + int limit = 12; // With scale 2, limit is always 12. 
+
+  const int offset = scale * 32;
+  const int inv_angle_offset = mode_disp * 64;
+  const int shuf_offset = mode_disp * 16;
+
+  const __m128i vweightlo = _mm_load_si128((const __m128i*) &intra_pdpc_w16_ver_improved_weight[offset + 0]);
+  const __m128i vweighthi = _mm_load_si128((const __m128i*) &intra_pdpc_w16_ver_improved_weight[offset + 16]);
+  const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset];
+  const __m128i vshuf = _mm_load_si128((const __m128i*) & intra_pdpc_shuffle_vectors_w16_scale2_ver[shuf_offset]);
+
+  // Handle one row at a time, 16 pixels per iteration. With scale 2 only the first 12 columns need the blend.
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < limit; x += 16) {
+      __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[(y + 0) + shifted_inv_angle_sum[0] + 1]);
+      vleft = _mm_shuffle_epi8(vleft, vshuf);
+
+      __m128i vdst = _mm_load_si128((const __m128i*)(dst + ((y + 0) * width + x)));
+
+      __m128i vlo = _mm_unpacklo_epi8(vdst, vleft);
+      __m128i vhi = _mm_unpackhi_epi8(vdst, vleft);
+
+      __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweightlo);
+      __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweighthi);
+
+      vmaddlo = _mm_add_epi16(vmaddlo, v32s);
+      vmaddhi = _mm_add_epi16(vmaddhi, v32s);
+
+      vmaddlo = _mm_srai_epi16(vmaddlo, 6);
+      vmaddhi = _mm_srai_epi16(vmaddhi, 6);
+
+      __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi);
+
+      _mm_store_si128((__m128i*)(dst + (y * width + x)), packed);
+    }
+  }
+}
+
 // Horizontal modes
 
@@ -4114,7 +4156,7 @@ static void uvg_angular_pred_avx2(
         if (mode_disp < 14)
          angular_pdpc_ver_w16_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp);
         else
-          angular_pdpc_ver_w16_scale2_avx2(dst, ref_side, width, height, mode_disp);
+          angular_pdpc_ver_w16_scale2_improved_avx2(dst, ref_side, width, height, mode_disp);
         break;
       default:
         assert(false && "Intra PDPC: Invalid scale.\n");

From 382edaef271cf248cb5c549abcf5e71d10022c30 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Thu, 8 Aug 2024 14:03:59 +0300
Subject: [PATCH 198/237] Improve horizontal w4 PDPC.
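
The horizontal path mirrors the vertical one: the blend runs against the top
reference and the weight depends on the row instead of the column. As a rough
reference (not part of this patch):

    out = (cur * (64 - wT) + top * wT + 32) >> 6
    wT  = 32 >> ((2 * y) >> scale)
    top = ref_side[x + shifted_inv_angle_sum[y] + 1]

For example, scale 1 and y = 1 give wT = 16, i.e. the (48, 16) byte pairs in the
second group of the new scale 1 weight row added below.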
--- src/strategies/avx2/intra-avx2.c | 64 +++++++++++++++++++++---- src/strategies/avx2/intra_avx2_tables.h | 17 +++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 00cbeefd..90b2934c 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2608,7 +2608,7 @@ static void angular_pdpc_ver_h16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } } -static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2728,7 +2728,7 @@ static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, } -static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -3265,6 +3265,53 @@ static void angular_pdpc_ver_w16_scale2_improved_avx2(uvg_pixel* dst, const uvg_ // Horizontal modes +static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 4; + + int limit = MIN(3 << scale, height); + + // __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + // __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width + __m128i v32s = _mm_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int table_offset = scale * 128; + const int shuf_offset = mode_disp * 256; + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + for (int y = 0, so = 0, wo = 0; y < limit; y += 4, so += 16, wo += 32) { + const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_w4_hor[shuf_offset + so]); + + __m128i vtop = _mm_loadu_si128((__m128i*) & ref_side[shifted_inv_angle_sum[y] + 1]); + vtop = _mm_shuffle_epi8(vtop, vshuf); + + const int offset = table_offset + wo; + + //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); + __m128i vweightlo = _mm_load_si128((const __m128i*) &intra_pdpc_w4_hor_improved_weight[offset + 0]); + __m128i vweighthi = _mm_load_si128((const __m128i*) &intra_pdpc_w4_hor_improved_weight[offset + 16]); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vtop); + __m128i vhi = _mm_unpackhi_epi8(vdst, vtop); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweightlo); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweighthi); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), packed); + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -4169,12 +4216,13 @@ static void uvg_angular_pred_avx2( else { switch (width) { case 4: - // Low mode disp -> low angle. 
For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. - // Handle high angles with more tight reference spacing with separate functions with more optimized loads. - if (mode_disp < 6) - angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); - else - angular_pdpc_hor_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); + // Low mode disp -> high angle. For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. + // Handle low angles with more tight reference spacing with separate functions with more optimized loads. + /*if (mode_disp < 6) + angular_pdpc_hor_w4_high_angle_improved_avx2(dst, ref_side, height, scale, mode_disp); + else*/ + // The above code was not accessed ever. There is no case where width == 4 and and mode disp < 6 for horizontal modes where PDPC is enabled. + angular_pdpc_hor_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); break; case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index eb134d89..af72b9c1 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1359,6 +1359,23 @@ ALIGNED(32) const uint8_t intra_pdpc_w16_ver_improved_weight[] = { }; +// Weights for improved intra pdpc w4 horizontal. +ALIGNED(32) const uint8_t intra_pdpc_w4_hor_improved_weight[] = { + 32, 32, 32, 32, 32, 32, 32, 32, 56, 8, 56, 8, 56, 8, 56, 8, 62, 2, 62, 2, 62, 2, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 48, 16, 48, 16, 48, 16, 48, 16, 56, 8, 56, 8, 56, 8, 56, 8, 60, 4, 60, 4, 60, 4, 60, 4, // Scale 1 + 62, 2, 62, 2, 62, 2, 62, 2, 63, 1, 63, 1, 63, 1, 63, 1, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, // Scale 2 + 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, + 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, +}; + + // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. static ALIGNED(32) const int16_t intra_pdpc_shifted_inv_angle_sum[] = { From d51588b7559ea48ac259431e0b3b319287c705d8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 8 Aug 2024 17:20:11 +0300 Subject: [PATCH 199/237] Improve horizontal w8 PDPC. 
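The w8 weight table added here follows the same pattern as the w4 one: sixteen rows per scale, eight (64 - wT, wT) byte pairs per row, with wT = 32 >> ((2 * y) >> scale). A sketch of how one scale block could be generated, useful for cross-checking the hand-expanded constants (illustrative only; the committed table is written out by hand):

static void build_w8_hor_improved_weights_sketch(uint8_t *out, int scale)
{
  // One scale block is 16 rows * 8 pairs * 2 bytes = 256 bytes,
  // matching the table_offset = scale * 256 used by the new function.
  for (int y = 0; y < 16; ++y) {
    const int wT = 32 >> ((2 * y) >> scale);  // 0 for rows past the PDPC limit
    for (int x = 0; x < 8; ++x) {
      out[y * 16 + 2 * x + 0] = (uint8_t)(64 - wT);
      out[y * 16 + 2 * x + 1] = (uint8_t)wT;
    }
  }
}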
--- src/strategies/avx2/intra-avx2.c | 51 +++++++++++++++++++++++-- src/strategies/avx2/intra_avx2_tables.h | 28 ++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 90b2934c..d4e5a306 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3271,8 +3271,6 @@ static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r int limit = MIN(3 << scale, height); - // __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - // __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width __m128i v32s = _mm_set1_epi16(32); // Scale can be 0, 1 or 2 @@ -3289,7 +3287,6 @@ static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r const int offset = table_offset + wo; - //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m128i vdst = _mm_load_si128((const __m128i*)(dst + y * width)); __m128i vweightlo = _mm_load_si128((const __m128i*) &intra_pdpc_w4_hor_improved_weight[offset + 0]); __m128i vweighthi = _mm_load_si128((const __m128i*) &intra_pdpc_w4_hor_improved_weight[offset + 16]); @@ -3312,6 +3309,52 @@ static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } +static void angular_pdpc_hor_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +{ + const int width = 8; + + int limit = MIN(3 << scale, height); + + __m256i v32s = _mm256_set1_epi16(32); + + // Scale can be 0, 1 or 2 + const int table_offset = scale * 256; + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + // Handle 4 lines at once since PDPC is not done on 8x2 blocks. + for (int y = 0, o = table_offset; y < limit; y += 4, o += 64) { + const __m256i vweight01 = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_hor_improved_weight[o + 0]); + const __m256i vweight23 = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_hor_improved_weight[o + 32]); + + const __m256i vidx = _mm256_set_epi64x(shifted_inv_angle_sum[y + 3], shifted_inv_angle_sum[y + 2], + shifted_inv_angle_sum[y + 1], shifted_inv_angle_sum[y + 0]); + + __m256i vdst = _mm256_load_si256((const __m256i*)(dst + y * width)); + __m256i vtop = _mm256_i64gather_epi64((const long long int*)&ref_side[1], vidx, 1); + + __m256i vlo = _mm256_unpacklo_epi8(vdst, vtop); + __m256i vhi = _mm256_unpackhi_epi8(vdst, vtop); + + __m256i v01 = _mm256_permute2x128_si256(vlo, vhi, 0x20); + __m256i v23 = _mm256_permute2x128_si256(vlo, vhi, 0x31); + + __m256i vmadd01 = _mm256_maddubs_epi16(v01, vweight01); + __m256i vmadd23 = _mm256_maddubs_epi16(v23, vweight23); + + vmadd01 = _mm256_add_epi16(vmadd01, v32s); + vmadd23 = _mm256_add_epi16(vmadd23, v32s); + + vmadd01 = _mm256_srai_epi16(vmadd01, 6); + vmadd23 = _mm256_srai_epi16(vmadd23, 6); + + __m256i packed = _mm256_packus_epi16(vmadd01, vmadd23); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_storeu_si256((__m256i*)(dst + (y * width)), packed); + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -4224,7 +4267,7 @@ static void uvg_angular_pred_avx2( // The above code was not accessed ever. 
There is no case where width == 4 and and mode disp < 6 for horizontal modes where PDPC is enabled. angular_pdpc_hor_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); break; - case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; + case 8: angular_pdpc_hor_w8_improved_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function case 32: case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index af72b9c1..ba6b8bb6 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1375,6 +1375,34 @@ ALIGNED(32) const uint8_t intra_pdpc_w4_hor_improved_weight[] = { 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, }; +// Weights for improved intra pdpc w8 horizontal. +ALIGNED(32) const uint8_t intra_pdpc_w8_hor_improved_weight[] = { + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, // Scale 0 + 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, // Scale 1 + 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, + 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, // Scale 2 + 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, 48, 16, + 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, + 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, 60, 4, + 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, + 63, 1, 
63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, 63, 1, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, + 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, +}; + // Pre-calculated shifted inverse angle sums for pdpc for y- and x-values [0, 64]. Grouped by mode_disp. // Index by y or x based on pdpc direction. From 8c47928ddb3ed8dfb1b51a9a4cea690c2015a3f7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 9 Aug 2024 16:52:02 +0300 Subject: [PATCH 200/237] WIP on improving w16 horizontal PDPC. The current solution does not gain any speedups. Try handling more data at a time. --- src/strategies/avx2/intra-avx2.c | 47 +++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d4e5a306..3295ef53 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3355,6 +3355,51 @@ static void angular_pdpc_hor_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } +static void angular_pdpc_hor_w16_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) +{ + int limit = MIN(3 << scale, height); + __m128i v32s = _mm_set1_epi16(32); + + const int inv_angle_offset = mode_disp * 64; + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + + const __m128i vblend = _mm_setr_epi8( + 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, + 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff + ); + + // Handle one line at a time. Skip line if vertical limit reached. + for (int y = 0; y < limit; ++y) { + const uint8_t weight1 = 32 >> (2 * y >> scale); + const uint8_t weight0 = 64 - weight1; + __m128i vw0 = _mm_set1_epi8(weight0); + __m128i vw1 = _mm_set1_epi8(weight1); + + __m128i vweight = _mm_blendv_epi8(vw0, vw1, vblend); + + for (int x = 0; x < width; x += 16) { + __m128i vdst = _mm_load_si128((__m128i*)(dst + (y * width + x))); + __m128i vtop = _mm_loadu_si128((__m128i*) &ref_side[x + shifted_inv_angle_sum[y] + 1]); + + __m128i vlo = _mm_unpacklo_epi8(vdst, vtop); + __m128i vhi = _mm_unpackhi_epi8(vdst, vtop); + + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); + + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); + + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + _mm_storeu_si128((__m128i*)(dst + (y * width + x)), packed); + } + } +} + // This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. 
static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) @@ -4270,7 +4315,7 @@ static void uvg_angular_pred_avx2( case 8: angular_pdpc_hor_w8_improved_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function case 32: - case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 64: angular_pdpc_hor_w16_improved_avx2(dst, ref_side, width, height, scale, mode_disp); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } From f58c8bf489dc7272211def1b5a9255da38ebedeb Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Aug 2024 14:20:26 +0300 Subject: [PATCH 201/237] Good enough. It is now consistent with other PDPC solutions, leave it as is. --- src/strategies/avx2/intra-avx2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3295ef53..1eea9b6e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3363,19 +3363,21 @@ static void angular_pdpc_hor_w16_improved_avx2(uvg_pixel* dst, const uvg_pixel* const int inv_angle_offset = mode_disp * 64; const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m128i vblend = _mm_setr_epi8( + /*const __m128i vblend = _mm_setr_epi8( 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff - ); + );*/ // Handle one line at a time. Skip line if vertical limit reached. for (int y = 0; y < limit; ++y) { const uint8_t weight1 = 32 >> (2 * y >> scale); const uint8_t weight0 = 64 - weight1; - __m128i vw0 = _mm_set1_epi8(weight0); - __m128i vw1 = _mm_set1_epi8(weight1); + ALIGNED(2) const uint8_t tmp[2] = { weight0, weight1 }; + // __m128i vw0 = _mm_set1_epi8(weight0); + // __m128i vw1 = _mm_set1_epi8(weight1); - __m128i vweight = _mm_blendv_epi8(vw0, vw1, vblend); + //__m128i vweight = _mm_blendv_epi8(vw0, vw1, vblend); + __m128i vweight = _mm_set1_epi16(*(uint16_t*)tmp); for (int x = 0; x < width; x += 16) { __m128i vdst = _mm_load_si128((__m128i*)(dst + (y * width + x))); From 29e954f692374ab0efd4199b173a4ac5dadbd11f Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Aug 2024 15:23:35 +0300 Subject: [PATCH 202/237] Improve mode18 w4 PDPC. 
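Mode 18 is the purely horizontal direction, so its PDPC term is a signed correction wT * (ref_top[x] - top_left) rather than a weighted blend, which is why these routines widen to 16 bits with _mm256_cvtepu8_epi16 before multiplying. A scalar sketch of the computation for reference (function name is illustrative; the final clamp mirrors the saturation done by _mm_packus_epi16):

static void pdpc_mode18_scalar_sketch(uint8_t *dst, uint8_t top_left,
                                      const uint8_t *ref_top,
                                      int width, int height, int scale)
{
  const int limit = MIN(3 << scale, height);
  for (int y = 0; y < limit; ++y) {
    const int wT = 32 >> ((2 * y) >> scale);
    for (int x = 0; x < width; ++x) {
      const int corr = (wT * (ref_top[x + 1] - top_left) + 32) >> 6;
      const int val = dst[y * width + x] + corr;
      dst[y * width + x] = (uint8_t)(val < 0 ? 0 : val > 255 ? 255 : val);
    }
  }
}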
--- src/strategies/avx2/intra-avx2.c | 52 ++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1eea9b6e..c1e8f7a1 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2783,6 +2783,48 @@ static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, // Mode 18 +static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 4; + const int limit = MIN(3 << scale, height); + + //__m128i vseq = _mm_setr_epi32(0, 1, 2, 3); + //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width + __m256i v32s = _mm256_set1_epi16(32); + + const uint32_t ref4 = *(uint32_t*)&ref_side[1]; + + __m128i vref = _mm_set1_epi32(ref4); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Weight table offset + const int table_offset = scale * 64; + + for (int y = 0, o = 0; y < limit; y += 4, o += 16) { + const int offset = table_offset + o; + + //__m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); + __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); + + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_hor_weight[offset]); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + // Mode 50 @@ -3363,20 +3405,12 @@ static void angular_pdpc_hor_w16_improved_avx2(uvg_pixel* dst, const uvg_pixel* const int inv_angle_offset = mode_disp * 64; const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - /*const __m128i vblend = _mm_setr_epi8( - 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, - 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff - );*/ - // Handle one line at a time. Skip line if vertical limit reached. for (int y = 0; y < limit; ++y) { const uint8_t weight1 = 32 >> (2 * y >> scale); const uint8_t weight0 = 64 - weight1; ALIGNED(2) const uint8_t tmp[2] = { weight0, weight1 }; - // __m128i vw0 = _mm_set1_epi8(weight0); - // __m128i vw1 = _mm_set1_epi8(weight1); - //__m128i vweight = _mm_blendv_epi8(vw0, vw1, vblend); __m128i vweight = _mm_set1_epi16(*(uint16_t*)tmp); for (int x = 0; x < width; x += 16) { @@ -4209,7 +4243,7 @@ static void uvg_angular_pred_avx2( const uvg_pixel top_left = ref_main[0]; switch (width) { - case 4: angular_pdpc_mode18_w4_avx2(dst, top_left, ref_side, height, scale); break; + case 4: angular_pdpc_mode18_w4_improved_avx2(dst, top_left, ref_side, height, scale); break; case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; From eecfaa9909e7454fccb0804460fcb25c6f0fc378 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Aug 2024 15:31:58 +0300 Subject: [PATCH 203/237] Improve mode18 w8 PDPC. 
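Same idea as the w4 case, but two rows per 256-bit vector: the eight top samples are broadcast to both 128-bit lanes and only the per-row weight changes. Note that limit = MIN(3 << scale, height) is not always a multiple of 2 (scale 0 with height >= 3 gives limit 3), but rows at or past the limit have wT = 0, so the extra row touched by the last iteration comes out unchanged, since (0 * diff + 32) >> 6 == 0. For scale 0 the per-row weights work out to wT = 32, 8, 2, 0, which is why the corresponding table rows degenerate to zero weight (assuming intra_pdpc_w8_hor_weight follows the same layout as the improved tables added earlier in this series).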
--- src/strategies/avx2/intra-avx2.c | 46 +++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index c1e8f7a1..5ef7ee3d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2788,8 +2788,6 @@ static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel const int width = 4; const int limit = MIN(3 << scale, height); - //__m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width __m256i v32s = _mm256_set1_epi16(32); const uint32_t ref4 = *(uint32_t*)&ref_side[1]; @@ -2805,7 +2803,6 @@ static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel for (int y = 0, o = 0; y < limit; y += 4, o += 16) { const int offset = table_offset + o; - //__m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); @@ -2825,6 +2822,47 @@ static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel } } +static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 8; + + int limit = MIN(3 << scale, height); + + //__m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); + //__m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width + __m256i v32s = _mm256_set1_epi16(32); + + const uint64_t ref8 = *(uint64_t*)&ref_side[1]; + + __m128i vref = _mm_set1_epi64x(ref8); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Weight table offset + const int table_offset = scale * 128; + + for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { + const __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_hor_weight[o]); + + //__m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); + __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + // Mode 50 @@ -4244,7 +4282,7 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pdpc_mode18_w4_improved_avx2(dst, top_left, ref_side, height, scale); break; - case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; + case 8: angular_pdpc_mode18_w8_improved_avx2(dst, top_left, ref_side, height, scale); break; case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; case 64: angular_pdpc_mode18_w64_avx2(dst, top_left, ref_side, height, scale); break; From b02256782322bfb6b99c5ae581649f2933b95009 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 12 Aug 2024 16:31:41 +0300 Subject: [PATCH 204/237] Improve mode18 w32 and w64 PDPC. 
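Since mode 18 reads the same top reference row for every output row, the widened reference vectors can be hoisted out of the y loop; only vwT changes per row. One note on the store path: _mm256_packus_epi16(a, b) packs within 128-bit lanes, so viewed as 64-bit chunks the result is [a.lo, b.lo, a.hi, b.hi]. The following _mm256_permute4x64_epi64 with _MM_SHUFFLE(3, 1, 2, 0) reorders that to [a.lo, a.hi, b.lo, b.hi], which puts the 32 packed pixels back into raster order before the aligned 256-bit store. The w64 version simply does this twice per row.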
--- src/strategies/avx2/intra-avx2.c | 132 +++++++++++++++++++++++++++++-- 1 file changed, 126 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5ef7ee3d..3fbe295d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2825,11 +2825,8 @@ static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 8; - int limit = MIN(3 << scale, height); - //__m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - //__m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width __m256i v32s = _mm256_set1_epi16(32); const uint64_t ref8 = *(uint64_t*)&ref_side[1]; @@ -2845,7 +2842,6 @@ static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { const __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_hor_weight[o]); - //__m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); @@ -2863,6 +2859,130 @@ static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel } } +// Can't do anything to improve w16 + +static void angular_pdpc_mode18_w32_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 32; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vrefa = _mm_loadu_si128((const __m128i*) & ref_side[1]); + __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); + + __m128i vrefb = _mm_loadu_si128((const __m128i*) & ref_side[17]); + __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. 
+ for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + // Calculate first half + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu0 = _mm256_sub_epi16(vref16a, vtopleft); + accu0 = _mm256_mullo_epi16(vwT, accu0); + accu0 = _mm256_add_epi16(accu0, v32s); + accu0 = _mm256_srai_epi16(accu0, 6); + accu0 = _mm256_add_epi16(vpred16, accu0); + + // Calculate second half + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu1 = _mm256_sub_epi16(vref16b, vtopleft); + accu1 = _mm256_mullo_epi16(vwT, accu1); + accu1 = _mm256_add_epi16(accu1, v32s); + accu1 = _mm256_srai_epi16(accu1, 6); + accu1 = _mm256_add_epi16(vpred16, accu1); + + // Store results + __m256i packed = _mm256_packus_epi16(accu0, accu1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (y * width)), packed); + } +} + +static void angular_pdpc_mode18_w64_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 64; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vrefa = _mm_loadu_si128((const __m128i*) &ref_side[0 + 1]); + __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); + + __m128i vrefb = _mm_loadu_si128((const __m128i*) &ref_side[16 + 1]); + __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); + + __m128i vrefc = _mm_loadu_si128((const __m128i*) &ref_side[32 + 1]); + __m256i vref16c = _mm256_cvtepu8_epi16(vrefc); + + __m128i vrefd = _mm_loadu_si128((const __m128i*) &ref_side[48 + 1]); + __m256i vref16d = _mm256_cvtepu8_epi16(vrefd); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. 
+ for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + // Calculate first quarter + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu0 = _mm256_sub_epi16(vref16a, vtopleft); + accu0 = _mm256_mullo_epi16(vwT, accu0); + accu0 = _mm256_add_epi16(accu0, v32s); + accu0 = _mm256_srai_epi16(accu0, 6); + accu0 = _mm256_add_epi16(vpred16, accu0); + + // Calculate second quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu1 = _mm256_sub_epi16(vref16b, vtopleft); + accu1 = _mm256_mullo_epi16(vwT, accu1); + accu1 = _mm256_add_epi16(accu1, v32s); + accu1 = _mm256_srai_epi16(accu1, 6); + accu1 = _mm256_add_epi16(vpred16, accu1); + + // Calculate third quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 32))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu2 = _mm256_sub_epi16(vref16c, vtopleft); + accu2 = _mm256_mullo_epi16(vwT, accu2); + accu2 = _mm256_add_epi16(accu2, v32s); + accu2 = _mm256_srai_epi16(accu2, 6); + accu2 = _mm256_add_epi16(vpred16, accu2); + + // Calculate fourth quarter + vpred = _mm_load_si128((__m128i*)(dst + (y * width + 48))); + vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu3 = _mm256_sub_epi16(vref16d, vtopleft); + accu3 = _mm256_mullo_epi16(vwT, accu3); + accu3 = _mm256_add_epi16(accu3, v32s); + accu3 = _mm256_srai_epi16(accu3, 6); + accu3 = _mm256_add_epi16(vpred16, accu3); + + __m256i packed0 = _mm256_packus_epi16(accu0, accu1); + __m256i packed1 = _mm256_packus_epi16(accu2, accu3); + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(dst + (y * width + 0)), packed0); + _mm256_store_si256((__m256i*)(dst + (y * width + 32)), packed1); + } +} + // Mode 50 @@ -4284,8 +4404,8 @@ static void uvg_angular_pred_avx2( case 4: angular_pdpc_mode18_w4_improved_avx2(dst, top_left, ref_side, height, scale); break; case 8: angular_pdpc_mode18_w8_improved_avx2(dst, top_left, ref_side, height, scale); break; case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; - case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; - case 64: angular_pdpc_mode18_w64_avx2(dst, top_left, ref_side, height, scale); break; + case 32: angular_pdpc_mode18_w32_improved_avx2(dst, top_left, ref_side, height, scale); break; + case 64: angular_pdpc_mode18_w64_improved_avx2(dst, top_left, ref_side, height, scale); break; default: assert(false && "Intra PDPC, invalid width.\n"); break; From 0e238f0af72f787e25aa2555b98d72bd54803890 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 13 Aug 2024 16:06:19 +0300 Subject: [PATCH 205/237] Improve intra horizontal w4. Change the way filters are accessed. Take unnecessary stuff out of the loop. Significant speed increase. 
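For reference, the new w4 horizontal routine implements the ordinary 4-tap interpolation with a per-column filter phase (delta_fract[x]) and a per-column starting offset (delta_int[x]). A minimal scalar sketch (illustrative; filter is cubic_filter_8bit_c or cubic_filter_8bit_g as selected by use_cubic):

static void angular_w4_hor_scalar_sketch(uint8_t *dst, const uint8_t *ref_main,
                                         const int16_t *delta_int,
                                         const int16_t *delta_fract,
                                         int height, const int8_t (*filter)[4])
{
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < 4; ++x) {
      const int8_t *f = filter[delta_fract[x]];
      const uint8_t *p = &ref_main[y + delta_int[x]];
      const int sum = f[0] * p[0] + f[1] * p[1] + f[2] * p[2] + f[3] * p[3];
      const int val = (sum + 32) >> 6;
      dst[y * 4 + x] = (uint8_t)(val < 0 ? 0 : val > 255 ? 255 : val);
    }
  }
}

The AVX2 version precomputes the four filter kernels once with a single _mm_i32gather_epi32 (delta_fract does not change per row) and replaces the per-row reference gather with one 16-byte load plus per-mode shuffle vectors, which is where the speedup comes from.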
--- src/strategies/avx2/intra-avx2.c | 105 ++++++++++++++++++++---- src/strategies/avx2/intra_avx2_tables.h | 70 ++++++++++++++++ 2 files changed, 159 insertions(+), 16 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 3fbe295d..a985cc55 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -55,7 +55,7 @@ #include "strategies/missing-intel-intrinsics.h" -static const int16_t cubic_filter[32][4] = +ALIGNED(32) static const int16_t cubic_filter[32][4] = { { 0, 64, 0, 0 }, { -1, 63, 2, 0 }, @@ -94,7 +94,7 @@ static const int16_t cubic_filter[32][4] = // Specified in JVET-T2001 8.4.5.2.13 Table 25 // These are the fC interpolation filter coefficients -static const int8_t cubic_filter_8bit_c[32][4] = +ALIGNED(32) static const int8_t cubic_filter_8bit_c[32][4] = { { 0, 64, 0, 0 }, { -1, 63, 2, 0 }, @@ -132,7 +132,7 @@ static const int8_t cubic_filter_8bit_c[32][4] = // Specified in JVET-T2001 8.4.5.2.13 Table 25 // These are the fG interpolation filter coefficients -static const int8_t cubic_filter_8bit_g[32][4] = +ALIGNED(32) static const int8_t cubic_filter_8bit_g[32][4] = { {16, 32, 16, 0}, {16, 32, 16, 0}, @@ -430,7 +430,7 @@ static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } -static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const bool use_cubic) { const int width = 4; @@ -475,16 +475,16 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // Do 4-tap intra interpolation filtering uvg_pixel* p = (uvg_pixel*)(ref_main + y); - for (int_fast32_t x = 0; x < width; x += 4) { + if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[x + 0]], 8); - memcpy(f[1], cubic_filter[delta_fract[x + 1]], 8); - memcpy(f[2], cubic_filter[delta_fract[x + 2]], 8); - memcpy(f[3], cubic_filter[delta_fract[x + 3]], 8); + memcpy(f[0], cubic_filter[delta_fract[0]], 8); + memcpy(f[1], cubic_filter[delta_fract[1]], 8); + memcpy(f[2], cubic_filter[delta_fract[2]], 8); + memcpy(f[3], cubic_filter[delta_fract[3]], 8); } else { for (int xx = 0; xx < 4; ++xx) { - const int16_t offset = (delta_fract[x + xx] >> 1); + const int16_t offset = (delta_fract[xx] >> 1); f[xx][0] = 16 - offset; f[xx][1] = 32 - offset; f[xx][2] = 16 + offset; @@ -494,10 +494,10 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. 
- __m256i vidx = _mm256_setr_epi64x(delta_int[x + 0], - delta_int[x + 1], - delta_int[x + 2], - delta_int[x + 3]); + __m256i vidx = _mm256_setr_epi64x(delta_int[0], + delta_int[1], + delta_int[2], + delta_int[3]); __m256i all_weights = _mm256_loadu_si256((__m256i*)f); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); @@ -522,7 +522,71 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); dst += 16; - } + + } +} + +static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t (*filter)[4]) +{ + // const int width = 4; + + const __m256i w_shuf_01 = _mm256_setr_epi8( + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d + ); + + const __m256i w_shuf_23 = _mm256_setr_epi8( + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f + ); + + const int mode_idx = pred_mode <= 34 ? pred_mode - 2 : 66 - pred_mode; + const int table_offset = mode_idx * 64; + + const __m256i vpshuf0 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 0]); + const __m256i vpshuf1 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 32]); + + // Case for pred mode 2 where offset is 1, do not need to be handled, as that mode is handled by other function. Positive numbers would brake the indexing. + // Valid ref offsets are negative, in range -1 to -4 + int ref_offset = MIN(delta_int[0], delta_int[3]); + + // Copy the filter to local memory + __m128i vdfract = _mm_load_si128((__m128i*)delta_fract); + __m128i vidx = _mm_cvtepi16_epi32(vdfract); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); + + __m256i weights256 = _mm256_insertf128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); + + // Shuffle the interpolation weights into place. + __m256i w01 = _mm256_shuffle_epi8(weights256, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(weights256, w_shuf_23); + + // 4-tap interpolation filtering. + // For a 4 width block, height must be at least 4. Handle 4 lines at once + for (int y = 0; y < height; y += 4) { + // Load 16 samples and shuffle into place + __m128i vref = _mm_loadu_si128((__m128i*)&ref_main[y + ref_offset]); + __m256i vp = _mm256_insertf128_si256(_mm256_castsi128_si256(vref), vref, 1); + + __m256i vp_01 = _mm256_shuffle_epi8(vp, vpshuf0); + __m256i vp_23 = _mm256_shuffle_epi8(vp, vpshuf1); + + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i packed = _mm_packus_epi16(lo, hi); + + _mm_store_si128((__m128i*)dst, packed); + dst += 16; } } @@ -4170,6 +4234,9 @@ static void uvg_angular_pred_avx2( use_cubic = true; } + //const int8_t* pfilter = use_cubic ? 
&cubic_filter_8bit_c[0][0] : &cubic_filter_8bit_g[0][0]; + const int8_t (*pfilter)[4] = use_cubic ? cubic_filter_8bit_c : cubic_filter_8bit_g; + if (sample_disp != 0) { @@ -4201,7 +4268,13 @@ static void uvg_angular_pred_avx2( } else { switch (width) { - case 4: angular_pred_w4_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 4: + if (wide_angle_mode) + angular_pred_w4_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); + else + angular_pred_w4_hor_avx2(dst, ref_main, pred_mode, delta_int, delta_fract, height, pfilter); + + break; case 8: angular_pred_w8_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 16: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; case 32: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index ba6b8bb6..bd409faa 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -505,6 +505,76 @@ static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vector 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, }; +// Intra interpolation shuffle vectors for luma w4 horizontal +ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 3 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 4 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // Mode 5 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 6 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 
0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 7 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 8 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 9 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 10 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 11 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 13 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 14 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 15 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 
0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 16 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 17 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 18 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 19 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 20 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 21 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 22 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 23 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 24 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 
0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 25 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 26 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 27 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 28 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 29 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 30 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 31 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 32 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 33 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 
0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 34 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, +}; + // Chroma linear interpolation filter weights for width 8, vertical modes. These also work for w16 and w32. static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { From 4a19225ceecce54be1bdaada7497f759c982b4fe Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 15 Aug 2024 13:25:04 +0300 Subject: [PATCH 206/237] Improve intra horizontal w4 for wide angles. --- src/strategies/avx2/intra-avx2.c | 97 +++++++++++++------------------- 1 file changed, 40 insertions(+), 57 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a985cc55..5fd7047d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -430,7 +430,7 @@ static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } -static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const bool use_cubic) +static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) { const int width = 4; @@ -449,17 +449,17 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ); const __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a, - 0x00, 0x02, 0x08, 0x0a, 0x00, 0x02, 0x08, 0x0a + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d ); const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e, - 0x04, 0x06, 0x0c, 0x0e, 0x04, 0x06, 0x0c, 0x0e + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f ); const __m128i r_shuffle = _mm_setr_epi8( @@ -467,62 +467,45 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f ); - int16_t f[4][4] = { { 0 } }; + // Copy the filter to local memory + __m128i vdfract = _mm_load_si128((__m128i*)delta_fract); + __m128i vidx = _mm_cvtepi16_epi32(vdfract); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); + + __m256i weights256 = _mm256_insertf128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); + // Shuffle the interpolation weights into place. + __m256i w01 = _mm256_shuffle_epi8(weights256, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(weights256, w_shuf_23); // For a 4 width block, height must be at least 4. 
Handle 4 lines at once for (int y = 0; y < height; y += 4) { + // This solution assumes the delta int values to be 64-bit + // Cast from 16-bit to 64-bit. + __m128i vidx = _mm_load_si128((__m128i*)delta_int); + __m256i vidx256 = _mm256_cvtepu16_epi64(vidx); - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)(ref_main + y); + __m256i vp = _mm256_i64gather_epi64((const long long int*)&ref_main[y], vidx256, 1); - - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[0]], 8); - memcpy(f[1], cubic_filter[delta_fract[1]], 8); - memcpy(f[2], cubic_filter[delta_fract[2]], 8); - memcpy(f[3], cubic_filter[delta_fract[3]], 8); - } - else { - for (int xx = 0; xx < 4; ++xx) { - const int16_t offset = (delta_fract[xx] >> 1); - f[xx][0] = 16 - offset; - f[xx][1] = 32 - offset; - f[xx][2] = 16 + offset; - f[xx][3] = offset; - } - } + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - // This solution assumes the delta int values to be 64-bit - // Cast from 16-bit to 64-bit. - __m256i vidx = _mm256_setr_epi64x(delta_int[0], - delta_int[1], - delta_int[2], - delta_int[3]); - __m256i all_weights = _mm256_loadu_si256((__m256i*)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + vp_01 = _mm256_permute4x64_epi64(vp_01, _MM_SHUFFLE(3, 1, 2, 0)); + vp_23 = _mm256_permute4x64_epi64(vp_23, _MM_SHUFFLE(3, 1, 2, 0)); + vp_01 = _mm256_shuffle_epi32(vp_01, _MM_SHUFFLE(3, 1, 2, 0)); + vp_23 = _mm256_shuffle_epi32(vp_23, _MM_SHUFFLE(3, 1, 2, 0)); - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - //__m256i vp = _mm256_loadu_si256((__m256i*)(p + delta_int[y])); - - //__m256i tmp = _mm256_permute4x64_epi64(vp, _MM_SHUFFLE(2, 1, 1, 0)); - - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i packed = _mm_packus_epi16(lo, hi); - _mm_store_si128((__m128i*)dst, _mm_shuffle_epi8(filtered, r_shuffle)); - dst += 16; - + _mm_store_si128((__m128i*)dst, packed); + dst += 16; } } @@ -4270,7 +4253,7 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: if (wide_angle_mode) - angular_pred_w4_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); + angular_pred_w4_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); else angular_pred_w4_hor_avx2(dst, ref_main, pred_mode, delta_int, delta_fract, height, pfilter); From 97eebe32d5ae393bc39175d1edded6f8160fb03b Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 15 Aug 2024 16:26:02 +0300 Subject: [PATCH 207/237] Further improve intra horizontal w4. Add optimized wide angles. Add multi ref line handling. 
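The shuffle vector table is now indexed so that it also covers wide angle modes and the extra reference lines. A small sketch of the offset arithmetic used by the updated angular_pred_w4_hor_avx2 (function name here is illustrative):

static int w4_hor_shuffle_table_offset(int pred_mode, int multi_ref_line)
{
  // Wide angle modes -12..-1 and regular modes 2..34 map directly to
  // indices 0..46; modes 35..66 reuse the mirrored vectors (e.g. mode 66
  // shares index 14 with mode 2). Each mode owns three 64-byte blocks,
  // one per multi-ref line 0..2.
  const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode;
  return mode_idx * 192 + multi_ref_line * 64;
}

Modes below -7 (and mode -7 with MRL 2) still take the high angle path; their table entries are left unused (0xff), since their reference samples are too far apart for a single 16-byte load and shuffle to cover.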
--- src/strategies/avx2/intra-avx2.c | 10 +- src/strategies/avx2/intra_avx2_tables.h | 284 +++++++++++++++++++++--- 2 files changed, 255 insertions(+), 39 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5fd7047d..297e844b 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -509,7 +509,7 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } } -static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t (*filter)[4]) +static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t multi_ref_line, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t (*filter)[4]) { // const int width = 4; @@ -527,8 +527,8 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f ); - const int mode_idx = pred_mode <= 34 ? pred_mode - 2 : 66 - pred_mode; - const int table_offset = mode_idx * 64; + const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode; // Considers also wide angle modes. + const int table_offset = mode_idx * 192 + multi_ref_line * 64; const __m256i vpshuf0 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 0]); const __m256i vpshuf1 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 32]); @@ -4252,10 +4252,10 @@ static void uvg_angular_pred_avx2( else { switch (width) { case 4: - if (wide_angle_mode) + if (pred_mode < -7 || (multi_ref_index == 2 && pred_mode == -7)) // High angles need special handling angular_pred_w4_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); else - angular_pred_w4_hor_avx2(dst, ref_main, pred_mode, delta_int, delta_fract, height, pfilter); + angular_pred_w4_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter); break; case 8: angular_pred_w8_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index bd409faa..7a3eda94 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -505,73 +505,289 @@ static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vector 0x0f, 0x10, 0x0e, 0x0f, 0x0d, 0x0e, 0x0c, 0x0d, 0x0b, 0x0c, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, }; -// Intra interpolation shuffle vectors for luma w4 horizontal +// Intra interpolation shuffle vectors for luma w4 horizontal. Includes wide angle modes [-12, 1]. Wide angle numbering goes from -12 to 1 since planar and DC (0, 1) are not considered angular modes. 
ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 2 + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -12 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -11 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -10 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -9 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -8 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, // Mode -7 mrl 0 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, // mrl 1 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x0a, 0x0b, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 
0x0b, 0x0c, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0c, 0x0d, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, // mrl 2 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0c, 0x0d, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0e, 0x0f, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0f, 0x10, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, // Mode -6 mrl 0 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, // mrl 1 + 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, // Mode -5 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, // mrl 2 + 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, // Mode -4 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, // Mode -3 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, // Mode -2 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, // Mode -1 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, + 
0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 0 mrl 0 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 3 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 1 mrl 0 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 4 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 1 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, - 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // Mode 5 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 
0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 2 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 3 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 4 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x03, 
0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // Mode 5 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 6 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // mrl 2 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 6 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 7 mrl 0 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 7 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 
0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // mrl 2 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, - 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 8 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 8 mrl 0 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, - 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 9 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, - 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 10 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 9 mrl 0 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 11 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 
0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 10 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // mrl 2 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 11 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 12 mrl 0 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 13 + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 
0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 13 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 14 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 15 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 
0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 16 mrl 0 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 14 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 15 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 16 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 17 mrl 0 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 17 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 18 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 
0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 19 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 18 mrl 0 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 20 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 21 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 22 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 19 mrl 0 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 23 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 24 
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, - 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 25 + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 20 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 21 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 22 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 
0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 23 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // Mode 24 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 25 mrl 0 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, - 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 
0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 26 + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 1 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, - 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 27 + 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 26 mrl 0 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, - 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 28 + 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 27 mrl 0 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, - 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 29 + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 1 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, - 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 
0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 30 + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 28 mrl 0 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 29 mrl 0 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, - 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 31 + 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 2 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // Mode 30 mrl 0 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 
0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // Mode 31 mrl 0 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, - 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 32 + 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 2 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 32 mrl 0 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 
0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 33 mrl 0 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 1 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 2 + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 34 mrl 0 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, - 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 33 + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 1 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, - 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // Mode 34 + 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, // mrl 2 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, }; From 95b1b4f940c59737adf26e2d35c461699190d180 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 16 Aug 2024 15:56:39 +0300 Subject: [PATCH 208/237] Improve intra horizontal w8. 
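The new 8-wide horizontal path loads each reference row once and shuffles the four
filter taps into place with precomputed vectors from
intra_luma_interpolation_shuffle_vectors_w8_hor, selected by prediction mode and
multi-ref line. A minimal scalar sketch of the table indexing and of the rounding
the vector code performs follows; the helper names w8_hor_table_offset and
filter_one_sample are illustrative only, not symbols from the patch:

  #include <stdint.h>

  // Each mode occupies 192 bytes in the w8 table:
  // 3 multi-ref lines * 2 shuffle vectors * 32 bytes.
  static inline int w8_hor_table_offset(int pred_mode, int multi_ref_line)
  {
    // Wide angle modes are folded into the same table; this mirrors the
    // mode_idx computation in angular_pred_w8_hor_avx2 below.
    const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode;
    return mode_idx * 192 + multi_ref_line * 64;
  }

  // Scalar form of what the maddubs/add/shift/packus sequence computes per pixel:
  // p[] are the four reference samples picked by the shuffle, f[] the 4-tap
  // weights (they sum to 64 for these filters, hence the +32 rounding and >> 6).
  static inline uint8_t filter_one_sample(const uint8_t p[4], const int8_t f[4])
  {
    int sum = p[0] * f[0] + p[1] * f[1] + p[2] * f[2] + p[3] * f[3];
    sum = (sum + 32) >> 6;
    return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); // Saturate like packus.
  }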
---
 src/strategies/avx2/intra-avx2.c        |  86 ++++++-
 src/strategies/avx2/intra_avx2_tables.h | 286 ++++++++++++++++++++++++
 2 files changed, 363 insertions(+), 9 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index 297e844b..f40f5d2a 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -462,11 +462,6 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel*
     0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f
   );
 
-  const __m128i r_shuffle = _mm_setr_epi8(
-    0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b,
-    0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f
-  );
-
   // Copy the filter to local memory
   __m128i vdfract = _mm_load_si128((__m128i*)delta_fract);
   __m128i vidx = _mm_cvtepi16_epi32(vdfract);
@@ -533,8 +528,6 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   const __m256i vpshuf0 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 0]);
   const __m256i vpshuf1 = _mm256_load_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w4_hor[table_offset + 32]);
 
-  // Case for pred mode 2 where offset is 1, do not need to be handled, as that mode is handled by other function. Positive numbers would brake the indexing.
-  // Valid ref offsets are negative, in range -1 to -4
   int ref_offset = MIN(delta_int[0], delta_int[3]);
 
   // Copy the filter to local memory
@@ -573,7 +566,7 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   }
 }
 
-static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic)
+static void angular_pred_w8_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic)
 {
   const int width = 8;
 
@@ -626,6 +619,75 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   }
 }
 
+static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t multi_ref_line, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t (*filter)[4])
+{
+  // const int width = 8;
+
+  __m256i vwshuf01 = _mm256_setr_epi8(
+    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+    0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
+  );
+
+  __m256i vwshuf23 = _mm256_setr_epi8(
+    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f,
+    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f,
+    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f,
+    0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f
+  );
+
+  int ref_offset = MIN(delta_int[0], delta_int[7]);
+  const __m256i v32s = _mm256_set1_epi16(32);
+
+  // Load weights
+  __m128i tmp = _mm_load_si128((__m128i*)delta_fract);
+  __m256i vidxw = _mm256_cvtepi16_epi32(tmp);
+  __m256i vweights = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidxw, 4);
+
+  __m256i vw01 = _mm256_shuffle_epi8(vweights, vwshuf01);
+  __m256i vw23 = _mm256_shuffle_epi8(vweights, vwshuf23);
+
+  vw01 = _mm256_permute4x64_epi64(vw01, _MM_SHUFFLE(3, 1, 2, 0));
+  vw23 = _mm256_permute4x64_epi64(vw23, _MM_SHUFFLE(3, 1, 2, 0));
+
+  /*tmp = _mm_load_si128((__m128i*)delta_int);
+  __m256i vidx = _mm256_cvtepi16_epi32(tmp);*/
+
+  const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode; // Also considers wide angle modes.
+  const int table_offset = mode_idx * 192 + multi_ref_line * 64;
+
+  const __m256i vpshuf01 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w8_hor[table_offset + 0]);
+  const __m256i vpshuf23 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w8_hor[table_offset + 32]);
+
+  // 4-tap interpolation filtering.
+  // For an 8 wide block, height must be at least 2. Handle 2 lines at once.
+  for (int y = 0; y < height; y += 2) {
+    // Load samples and shuffle into place
+    __m128i vp = _mm_loadu_si128((__m128i*)&ref_main[y + ref_offset]);
+    __m256i vp256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vp), vp, 1);
+    //__m256i vp0 = _mm256_i32gather_epi32((const int*)&ref_main[y + 0], vidx, 1);
+    //__m256i vp1 = _mm256_i32gather_epi32((const int*)&ref_main[y + 1], vidx, 1);
+    __m256i vp01 = _mm256_shuffle_epi8(vp256, vpshuf01);
+    __m256i vp23 = _mm256_shuffle_epi8(vp256, vpshuf23);
+
+    __m256i dot_01 = _mm256_maddubs_epi16(vp01, vw01);
+    __m256i dot_23 = _mm256_maddubs_epi16(vp23, vw23);
+    __m256i sum = _mm256_add_epi16(dot_01, dot_23);
+    sum = _mm256_add_epi16(sum, v32s);
+    sum = _mm256_srai_epi16(sum, 6);
+
+    __m128i lo = _mm256_castsi256_si128(sum);
+    __m128i hi = _mm256_extracti128_si256(sum, 1);
+    __m128i packed = _mm_packus_epi16(lo, hi);
+    //filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0));
+
+    _mm_store_si128((__m128i*)dst, packed);
+
+    dst += 16;
+  }
+}
+
 static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic)
 {
   int8_t f[64][4] = { { 0 } };
@@ -4258,7 +4320,13 @@ static void uvg_angular_pred_avx2(
         angular_pred_w4_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter);
         break;
-      case 8: angular_pred_w8_hor_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break;
+      case 8:
+        if (pred_mode < -2)
+          angular_pred_w8_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic);
+        else
+          angular_pred_w8_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter);
+
+        break;
       case 16: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
       case 32: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
       case 64: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h
index 7a3eda94..e3b53454 100644
--- a/src/strategies/avx2/intra_avx2_tables.h
+++ b/src/strategies/avx2/intra_avx2_tables.h
@@ -791,6 +791,292 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = {
   0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06,
 };
 
+// Intra interpolation shuffle vectors for luma w8 horizontal.
+ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { + 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // Mode -12 mrl 0 + 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, + 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // mrl 1 + 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, + 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // mrl 2 + 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, + 0x00, 0x01, 0x0b, 0x0c, 0x15, 0x16, 0x20, 0x21, 0x2b, 0x2c, 0x35, 0x36, 0x40, 0x41, 0x4b, 0x4c, 0x01, 0x02, 0x0c, 0x0d, 0x16, 0x17, 0x21, 0x22, 0x2c, 0x2d, 0x36, 0x37, 0x41, 0x42, 0x4c, 0x4d, // Mode -11 mrl 0 + 0x02, 0x03, 0x0d, 0x0e, 0x17, 0x18, 0x22, 0x23, 0x2d, 0x2e, 0x37, 0x38, 0x42, 0x43, 0x4d, 0x4e, 0x03, 0x04, 0x0e, 0x0f, 0x18, 0x19, 0x23, 0x24, 0x2e, 0x2f, 0x38, 0x39, 0x43, 0x44, 0x4e, 0x4f, + 0x00, 0x01, 0x0a, 0x0b, 0x15, 0x16, 0x20, 0x21, 0x2a, 0x2b, 0x35, 0x36, 0x40, 0x41, 0x4a, 0x4b, 0x01, 0x02, 0x0b, 0x0c, 0x16, 0x17, 0x21, 0x22, 0x2b, 0x2c, 0x36, 0x37, 0x41, 0x42, 0x4b, 0x4c, // mrl 1 + 0x02, 0x03, 0x0c, 0x0d, 0x17, 0x18, 0x22, 0x23, 0x2c, 0x2d, 0x37, 0x38, 0x42, 0x43, 0x4c, 0x4d, 0x03, 0x04, 0x0d, 0x0e, 0x18, 0x19, 0x23, 0x24, 0x2d, 0x2e, 0x38, 0x39, 0x43, 0x44, 0x4d, 0x4e, + 0x00, 0x01, 0x0b, 0x0c, 0x16, 0x17, 0x20, 0x21, 0x2b, 0x2c, 0x36, 0x37, 0x40, 0x41, 0x4b, 0x4c, 0x01, 0x02, 0x0c, 0x0d, 0x17, 0x18, 0x21, 0x22, 0x2c, 0x2d, 0x37, 0x38, 0x41, 0x42, 0x4c, 0x4d, // mrl 2 + 0x02, 0x03, 0x0d, 0x0e, 0x18, 0x19, 0x22, 0x23, 0x2d, 0x2e, 0x38, 0x39, 0x42, 0x43, 0x4d, 0x4e, 0x03, 0x04, 0x0e, 0x0f, 0x19, 0x1a, 0x23, 0x24, 0x2e, 0x2f, 0x39, 0x3a, 0x43, 0x44, 0x4e, 0x4f, + 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // Mode -10 mrl 0 + 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, + 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // mrl 1 + 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, + 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // mrl 2 + 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 
0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, + 0x00, 0x01, 0x05, 0x06, 0x0b, 0x0c, 0x10, 0x11, 0x15, 0x16, 0x1b, 0x1c, 0x20, 0x21, 0x25, 0x26, 0x01, 0x02, 0x06, 0x07, 0x0c, 0x0d, 0x11, 0x12, 0x16, 0x17, 0x1c, 0x1d, 0x21, 0x22, 0x26, 0x27, // Mode -9 mrl 0 + 0x02, 0x03, 0x07, 0x08, 0x0d, 0x0e, 0x12, 0x13, 0x17, 0x18, 0x1d, 0x1e, 0x22, 0x23, 0x27, 0x28, 0x03, 0x04, 0x08, 0x09, 0x0e, 0x0f, 0x13, 0x14, 0x18, 0x19, 0x1e, 0x1f, 0x23, 0x24, 0x28, 0x29, + 0x00, 0x01, 0x06, 0x07, 0x0b, 0x0c, 0x10, 0x11, 0x16, 0x17, 0x1b, 0x1c, 0x20, 0x21, 0x26, 0x27, 0x01, 0x02, 0x07, 0x08, 0x0c, 0x0d, 0x11, 0x12, 0x17, 0x18, 0x1c, 0x1d, 0x21, 0x22, 0x27, 0x28, // mrl 1 + 0x02, 0x03, 0x08, 0x09, 0x0d, 0x0e, 0x12, 0x13, 0x18, 0x19, 0x1d, 0x1e, 0x22, 0x23, 0x28, 0x29, 0x03, 0x04, 0x09, 0x0a, 0x0e, 0x0f, 0x13, 0x14, 0x19, 0x1a, 0x1e, 0x1f, 0x23, 0x24, 0x29, 0x2a, + 0x00, 0x01, 0x05, 0x06, 0x0a, 0x0b, 0x10, 0x11, 0x15, 0x16, 0x1a, 0x1b, 0x20, 0x21, 0x25, 0x26, 0x01, 0x02, 0x06, 0x07, 0x0b, 0x0c, 0x11, 0x12, 0x16, 0x17, 0x1b, 0x1c, 0x21, 0x22, 0x26, 0x27, // mrl 2 + 0x02, 0x03, 0x07, 0x08, 0x0c, 0x0d, 0x12, 0x13, 0x17, 0x18, 0x1c, 0x1d, 0x22, 0x23, 0x27, 0x28, 0x03, 0x04, 0x08, 0x09, 0x0d, 0x0e, 0x13, 0x14, 0x18, 0x19, 0x1d, 0x1e, 0x23, 0x24, 0x28, 0x29, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // Mode -8 mrl 0 + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // mrl 1 + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // mrl 2 + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // Mode -7 mrl 0 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x0f, 0x10, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // mrl 1 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // mrl 2 + 0x02, 0x03, 0x05, 0x06, 
0x08, 0x09, 0x0c, 0x0d, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, + 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, 0x14, 0x15, // Mode -6 mrl 0 + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, 0x16, 0x17, + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, // mrl 1 + 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x12, 0x13, 0x14, 0x15, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, // Mode -5 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x12, 0x13, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, // mrl 2 + 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // Mode -4 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 
0x0f, 0x10, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // Mode -3 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x10, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, // Mode -2 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x0d, 0x0e, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, // Mode -1 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 
0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, // Mode 0 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // Mode 1 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // Mode 2 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 
0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // Mode 3 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // Mode 4 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, // Mode 5 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 
0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, // Mode 6 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, 0x08, 0x09, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 7 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x07, 0x08, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // Mode 8 mrl 0 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 
0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // Mode 9 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode 10 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, // Mode 11 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 
0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, // Mode 12 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, // Mode 13 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, // Mode 14 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 
0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 15 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 16 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 17 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 
0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 18 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 19 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 20 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 
0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 21 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 22 mrl 0 + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 23 mrl 0 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // 
mrl 1 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 24 mrl 0 + 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // Mode 25 mrl 0 + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // Mode 26 mrl 0 + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 
0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x05, 0x06, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // Mode 27 mrl 0 + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, // mrl 1 + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // Mode 28 mrl 0 + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // Mode 29 mrl 0 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x05, 0x06, 
0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // mrl 2 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // Mode 30 mrl 0 + 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x00, 0x01, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, // Mode 31 mrl 0 + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, + 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x01, 0x02, 0x00, 0x01, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, + 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x08, 0x09, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // Mode 32 mrl 0 + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x03, 0x04, 0x02, 
0x03, 0x01, 0x02, 0x00, 0x01, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // Mode 33 mrl 0 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // Mode 34 mrl 0 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 1 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, + 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, 0x00, 0x01, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x01, 0x02, // mrl 2 + 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, +}; + // Chroma linear interpolation filter weights for width 8, vertical modes. These also work for w16 and w32. static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { From 75c1bd55e0f13574bea59e5eb6f5803c8cc4822f Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 9 Sep 2024 12:39:11 +0300 Subject: [PATCH 209/237] Improve intra horizontal w8 for wide angles. 
---
 src/strategies/avx2/intra-avx2.c | 38 +++++++++-----------------
 1 file changed, 10 insertions(+), 28 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index f40f5d2a..fca76695 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -566,44 +566,26 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   }
 }
 
-static void angular_pred_w8_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic)
+static void angular_pred_w8_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4])
 {
   const int width = 8;
 
-  int8_t f[8][4] = { { 0 } };
-  if (use_cubic) {
-    memcpy(f[0], cubic_filter_8bit_c[delta_fract[0]], sizeof(int8_t) * 4);
-    memcpy(f[1], cubic_filter_8bit_c[delta_fract[1]], sizeof(int8_t) * 4);
-    memcpy(f[2], cubic_filter_8bit_c[delta_fract[2]], sizeof(int8_t) * 4);
-    memcpy(f[3], cubic_filter_8bit_c[delta_fract[3]], sizeof(int8_t) * 4);
-    memcpy(f[4], cubic_filter_8bit_c[delta_fract[4]], sizeof(int8_t) * 4);
-    memcpy(f[5], cubic_filter_8bit_c[delta_fract[5]], sizeof(int8_t) * 4);
-    memcpy(f[6], cubic_filter_8bit_c[delta_fract[6]], sizeof(int8_t) * 4);
-    memcpy(f[7], cubic_filter_8bit_c[delta_fract[7]], sizeof(int8_t) * 4);
-  }
-  else {
-    for (int x = 0; x < 8; ++x) {
-      const int8_t offset = (delta_fract[x] >> 1);
-      f[x][0] = 16 - offset;
-      f[x][1] = 32 - offset;
-      f[x][2] = 16 + offset;
-      f[x][3] = offset;
-    }
-  }
-
   __m128i tmp = _mm_loadu_si128((__m128i*)delta_int);
   __m256i vidx = _mm256_cvtepi16_epi32(tmp);
-  __m256i weights = _mm256_loadu_si256((__m256i*)f);
+  // Load weights
+  tmp = _mm_load_si128((__m128i*)delta_fract);
+  __m256i vidxw = _mm256_cvtepi16_epi32(tmp);
+  __m256i vweights = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidxw, 4);
 
   for (int y = 0; y < height; y += 2) {
     // Do 4-tap intra interpolation filtering
     uvg_pixel* p = (uvg_pixel*)(ref_main + y);
-    __m256i vp0 = _mm256_i32gather_epi32((const int*)(p + 0), vidx, 1);
-    __m256i vp1 = _mm256_i32gather_epi32((const int*)(p + 1), vidx, 1);
+    __m256i vp0 = _mm256_i32gather_epi32((const int*)&ref_main[y + 0], vidx, 1);
+    __m256i vp1 = _mm256_i32gather_epi32((const int*)&ref_main[y + 1], vidx, 1);
 
-    __m256i dot_01 = _mm256_maddubs_epi16(vp0, weights);
-    __m256i dot_23 = _mm256_maddubs_epi16(vp1, weights);
+    __m256i dot_01 = _mm256_maddubs_epi16(vp0, vweights);
+    __m256i dot_23 = _mm256_maddubs_epi16(vp1, vweights);
     __m256i sum = _mm256_hadd_epi16(dot_01, dot_23);
     sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32));
     sum = _mm256_srai_epi16(sum, 6);
@@ -4322,7 +4304,7 @@ static void uvg_angular_pred_avx2(
       break;
     case 8:
       if (pred_mode < -2)
-        angular_pred_w8_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic);
+        angular_pred_w8_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter);
       else
         angular_pred_w8_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter);
 

From fc9fc526f1e70af2e4962594ba19ab4f5db106c2 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Fri, 16 Aug 2024 16:27:37 +0300
Subject: [PATCH 210/237] Add table generator script. Mark unused lines.
--- src/strategies/avx2/intra_avx2_tables.h | 120 ++++++++++++------------ 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index e3b53454..8993dbf0 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -793,66 +793,66 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { // Intra interpolation shuffle vectors for luma w8 horizontal. ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { - 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // Mode -12 mrl 0 - 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, - 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // mrl 1 - 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, - 0x00, 0x01, 0x10, 0x11, 0x20, 0x21, 0x30, 0x31, 0x40, 0x41, 0x50, 0x51, 0x60, 0x61, 0x70, 0x71, 0x01, 0x02, 0x11, 0x12, 0x21, 0x22, 0x31, 0x32, 0x41, 0x42, 0x51, 0x52, 0x61, 0x62, 0x71, 0x72, // mrl 2 - 0x02, 0x03, 0x12, 0x13, 0x22, 0x23, 0x32, 0x33, 0x42, 0x43, 0x52, 0x53, 0x62, 0x63, 0x72, 0x73, 0x03, 0x04, 0x13, 0x14, 0x23, 0x24, 0x33, 0x34, 0x43, 0x44, 0x53, 0x54, 0x63, 0x64, 0x73, 0x74, - 0x00, 0x01, 0x0b, 0x0c, 0x15, 0x16, 0x20, 0x21, 0x2b, 0x2c, 0x35, 0x36, 0x40, 0x41, 0x4b, 0x4c, 0x01, 0x02, 0x0c, 0x0d, 0x16, 0x17, 0x21, 0x22, 0x2c, 0x2d, 0x36, 0x37, 0x41, 0x42, 0x4c, 0x4d, // Mode -11 mrl 0 - 0x02, 0x03, 0x0d, 0x0e, 0x17, 0x18, 0x22, 0x23, 0x2d, 0x2e, 0x37, 0x38, 0x42, 0x43, 0x4d, 0x4e, 0x03, 0x04, 0x0e, 0x0f, 0x18, 0x19, 0x23, 0x24, 0x2e, 0x2f, 0x38, 0x39, 0x43, 0x44, 0x4e, 0x4f, - 0x00, 0x01, 0x0a, 0x0b, 0x15, 0x16, 0x20, 0x21, 0x2a, 0x2b, 0x35, 0x36, 0x40, 0x41, 0x4a, 0x4b, 0x01, 0x02, 0x0b, 0x0c, 0x16, 0x17, 0x21, 0x22, 0x2b, 0x2c, 0x36, 0x37, 0x41, 0x42, 0x4b, 0x4c, // mrl 1 - 0x02, 0x03, 0x0c, 0x0d, 0x17, 0x18, 0x22, 0x23, 0x2c, 0x2d, 0x37, 0x38, 0x42, 0x43, 0x4c, 0x4d, 0x03, 0x04, 0x0d, 0x0e, 0x18, 0x19, 0x23, 0x24, 0x2d, 0x2e, 0x38, 0x39, 0x43, 0x44, 0x4d, 0x4e, - 0x00, 0x01, 0x0b, 0x0c, 0x16, 0x17, 0x20, 0x21, 0x2b, 0x2c, 0x36, 0x37, 0x40, 0x41, 0x4b, 0x4c, 0x01, 0x02, 0x0c, 0x0d, 0x17, 0x18, 0x21, 0x22, 0x2c, 0x2d, 0x37, 0x38, 0x41, 0x42, 0x4c, 0x4d, // mrl 2 - 0x02, 0x03, 0x0d, 0x0e, 0x18, 0x19, 0x22, 0x23, 0x2d, 0x2e, 0x38, 0x39, 0x42, 0x43, 0x4d, 0x4e, 0x03, 0x04, 0x0e, 0x0f, 0x19, 0x1a, 0x23, 0x24, 0x2e, 0x2f, 0x39, 0x3a, 0x43, 0x44, 0x4e, 0x4f, - 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // Mode -10 mrl 0 - 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, - 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 
0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // mrl 1 - 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, - 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19, 0x20, 0x21, 0x28, 0x29, 0x30, 0x31, 0x38, 0x39, 0x01, 0x02, 0x09, 0x0a, 0x11, 0x12, 0x19, 0x1a, 0x21, 0x22, 0x29, 0x2a, 0x31, 0x32, 0x39, 0x3a, // mrl 2 - 0x02, 0x03, 0x0a, 0x0b, 0x12, 0x13, 0x1a, 0x1b, 0x22, 0x23, 0x2a, 0x2b, 0x32, 0x33, 0x3a, 0x3b, 0x03, 0x04, 0x0b, 0x0c, 0x13, 0x14, 0x1b, 0x1c, 0x23, 0x24, 0x2b, 0x2c, 0x33, 0x34, 0x3b, 0x3c, - 0x00, 0x01, 0x05, 0x06, 0x0b, 0x0c, 0x10, 0x11, 0x15, 0x16, 0x1b, 0x1c, 0x20, 0x21, 0x25, 0x26, 0x01, 0x02, 0x06, 0x07, 0x0c, 0x0d, 0x11, 0x12, 0x16, 0x17, 0x1c, 0x1d, 0x21, 0x22, 0x26, 0x27, // Mode -9 mrl 0 - 0x02, 0x03, 0x07, 0x08, 0x0d, 0x0e, 0x12, 0x13, 0x17, 0x18, 0x1d, 0x1e, 0x22, 0x23, 0x27, 0x28, 0x03, 0x04, 0x08, 0x09, 0x0e, 0x0f, 0x13, 0x14, 0x18, 0x19, 0x1e, 0x1f, 0x23, 0x24, 0x28, 0x29, - 0x00, 0x01, 0x06, 0x07, 0x0b, 0x0c, 0x10, 0x11, 0x16, 0x17, 0x1b, 0x1c, 0x20, 0x21, 0x26, 0x27, 0x01, 0x02, 0x07, 0x08, 0x0c, 0x0d, 0x11, 0x12, 0x17, 0x18, 0x1c, 0x1d, 0x21, 0x22, 0x27, 0x28, // mrl 1 - 0x02, 0x03, 0x08, 0x09, 0x0d, 0x0e, 0x12, 0x13, 0x18, 0x19, 0x1d, 0x1e, 0x22, 0x23, 0x28, 0x29, 0x03, 0x04, 0x09, 0x0a, 0x0e, 0x0f, 0x13, 0x14, 0x19, 0x1a, 0x1e, 0x1f, 0x23, 0x24, 0x29, 0x2a, - 0x00, 0x01, 0x05, 0x06, 0x0a, 0x0b, 0x10, 0x11, 0x15, 0x16, 0x1a, 0x1b, 0x20, 0x21, 0x25, 0x26, 0x01, 0x02, 0x06, 0x07, 0x0b, 0x0c, 0x11, 0x12, 0x16, 0x17, 0x1b, 0x1c, 0x21, 0x22, 0x26, 0x27, // mrl 2 - 0x02, 0x03, 0x07, 0x08, 0x0c, 0x0d, 0x12, 0x13, 0x17, 0x18, 0x1c, 0x1d, 0x22, 0x23, 0x27, 0x28, 0x03, 0x04, 0x08, 0x09, 0x0d, 0x0e, 0x13, 0x14, 0x18, 0x19, 0x1d, 0x1e, 0x23, 0x24, 0x28, 0x29, - 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // Mode -8 mrl 0 - 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, - 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // mrl 1 - 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, - 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d, 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e, // mrl 2 - 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0x12, 0x13, 0x16, 0x17, 0x1a, 0x1b, 0x1e, 0x1f, 0x03, 0x04, 0x07, 0x08, 0x0b, 0x0c, 0x0f, 0x10, 0x13, 0x14, 0x17, 0x18, 0x1b, 0x1c, 0x1f, 0x20, - 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // Mode -7 mrl 0 - 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x0f, 0x10, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, - 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 
0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // mrl 1 - 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, - 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, 0x17, 0x18, // mrl 2 - 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0c, 0x0d, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, 0x19, 0x1a, - 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, 0x14, 0x15, // Mode -6 mrl 0 - 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, 0x16, 0x17, - 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, 0x14, 0x15, // mrl 1 - 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x0f, 0x10, 0x12, 0x13, 0x15, 0x16, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, 0x16, 0x17, - 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, // mrl 2 - 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x12, 0x13, 0x14, 0x15, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, // Mode -5 mrl 0 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x12, 0x13, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x13, 0x14, - 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, // mrl 1 - 0x02, 0x03, 0x04, 0x05, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x13, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x11, 0x12, 0x13, 0x14, - 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, // mrl 2 - 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // Mode -4 mrl 0 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 
0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // mrl 1 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, // mrl 2 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // Mode -3 mrl 0 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // mrl 1 - 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, - 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, // mrl 2 - 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x10, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -12 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -11 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -10 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -9 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -8 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -7 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -6 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -5 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -4 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -3 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, // Mode -2 mrl 0 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x00, 0x01, 0x01, 0x02, 0x03, 
0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, // mrl 1

From 3a91f8fe789bef5fb602774d2c226f71b2672743 Mon Sep 17 00:00:00 2001
From: siivonek
Date: Mon, 19 Aug 2024 17:06:06 +0300
Subject: [PATCH 211/237] Improve intra horizontal w8. Change don't care fields in tables from 0xff to 0x00. 0xff caused truncation warnings.

---
 src/strategies/avx2/intra-avx2.c | 67 +++-
 src/strategies/avx2/intra_avx2_tables.h | 466 +++++++++++++++++++-----
 2 files changed, 436 insertions(+), 97 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index fca76695..d8408395 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -187,6 +187,11 @@ static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e
 );
 
+ALIGNED(32) static const uint8_t delta_fract_symmetry[] = {
+  1, 32, 16, 32, 8, 16, 2, 16, 8, 16, 4, 16, 32, 16, 32, 1,
+  32, 16, 32, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 8, 32, 16, 32, 1
+};
+
 const __m256i w_shuf_01 = _mm256_setr_epi8(
   0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02,
   0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a,
@@ -633,9 +638,6 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main,
   vw01 = _mm256_permute4x64_epi64(vw01, _MM_SHUFFLE(3, 1, 2, 0));
   vw23 = _mm256_permute4x64_epi64(vw23, _MM_SHUFFLE(3, 1, 2, 0));
 
-  /*tmp = _mm_load_si128((__m128i*)delta_int);
-  __m256i vidx = _mm256_cvtepi16_epi32(tmp);*/
-
   const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode; // Considers also wide angle modes.
   const int table_offset = mode_idx * 192 + multi_ref_line * 64;
 
@@ -670,7 +672,7 @@
   }
 }
 
-static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic)
+static void angular_pred_w16_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic)
 {
   int8_t f[64][4] = { { 0 } };
   if (use_cubic) {
@@ -721,6 +723,51 @@
   }
 }
 
+static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t multi_ref_line, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4])
+{
+  const int width = 16;
+  const int ref_offset = MIN(delta_int[0], delta_int[15]);
+  const __m256i v32s = _mm256_set1_epi16(32);
+
+  __m128i tmp0 = _mm_loadu_si128((__m128i*) &delta_fract[0]);
+  __m128i tmp1 = _mm_loadu_si128((__m128i*) &delta_fract[8]);
+
+  __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0);
+  __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1);
+
+  __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx0, 4);
+  __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx1, 4);
+
+  const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode; // Considers also wide angle modes.
+  const int table_offset = mode_idx * 192 + multi_ref_line * 64;
+
+  const __m256i vpshuf01 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w16_hor[table_offset + 0]);
+  const __m256i vpshuf23 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w16_hor[table_offset + 32]);
+
+  // Width 16, handle one row at a time
+  for (int y = 0; y < height; ++y) {
+    // Do 4-tap intra interpolation filtering
+    __m128i vp = _mm_loadu_si128((__m128i*)&ref_main[y + ref_offset]);
+    __m256i vp256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vp), vp, 1);
+
+    __m256i vp0 = _mm256_shuffle_epi8(vp256, vpshuf01);
+    __m256i vp1 = _mm256_shuffle_epi8(vp256, vpshuf23);
+
+    __m256i vmadd0 = _mm256_maddubs_epi16(vp0, vw0);
+    __m256i vmadd1 = _mm256_maddubs_epi16(vp1, vw1);
+    __m256i sum = _mm256_hadd_epi16(vmadd0, vmadd1);
+    sum = _mm256_add_epi16(sum, v32s);
+    sum = _mm256_srai_epi16(sum, 6);
+
+    __m128i lo = _mm256_castsi256_si128(sum);
+    __m128i hi = _mm256_extracti128_si256(sum, 1);
+    __m128i packed = _mm_packus_epi16(lo, hi);
+    packed = _mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0));
+
+    _mm_store_si128((__m128i*)(dst + (y * width)), packed);
+  }
+}
+
 static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height,
   const int16_t* delta_int, const int16_t* delta_fract)
 {
@@ -4309,9 +4356,15 @@ static void uvg_angular_pred_avx2(
         angular_pred_w8_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter);
       break;
 
-    case 16: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
-    case 32: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
-    case 64: angular_pred_w16_hor_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
+    case 16:
+      if (pred_mode < 5 || pred_mode == 33)
+        angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic);
+      else
+        angular_pred_w16_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter);
+
+      break;
+    case 32: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
+    case 64: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break;
     default:
       assert(false && "Intra angular predicion: illegal width.\n");
      break;
diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h
index 8993dbf0..60ac356b 100644
--- a/src/strategies/avx2/intra_avx2_tables.h
+++ b/src/strategies/avx2/intra_avx2_tables.h
@@ -507,36 +507,36 @@ static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vector
 
 // Intra interpolation shuffle vectors for luma w4 horizontal. Includes wide angle modes [-12, 1]. Wide angle numbering goes from -12 to 1 since planar and DC (0, 1) are not considered angular modes.
ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -12 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -11 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -10 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -9 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -8 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -11 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -10 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -9 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -8 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, // Mode -7 mrl 0 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x0d, 0x0e, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x0e, 0x0f, 0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x01, 0x02, 0x04, 0x05, 0x07, 0x08, 0x0a, 0x0b, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0b, 0x0c, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0c, 0x0d, // mrl 1 @@ -793,66 +793,66 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { // Intra interpolation shuffle vectors for luma w8 horizontal. 
ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -12 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -11 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -10 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -9 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -8 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -7 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -6 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -5 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -4 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode -3 | not used - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -11 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -10 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -9 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -8 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -7 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -6 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -5 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -4 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -3 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0b, 0x0c, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, // Mode -2 mrl 0 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0e, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0a, 0x0b, 0x0c, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0d, // mrl 1 @@ -1077,6 +1077,292 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, }; +// Intra interpolation shuffle vectors for luma w16 horizontal. 
+ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w16_hor[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -11 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -10 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -9 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -8 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -7 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -6 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -5 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -4 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -3 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -2 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -1 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 0 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 1 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 2 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 3 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 4 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // Mode 5 mrl 0 + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 1 + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 2 + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 
0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // Mode 6 mrl 0 + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 1 + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 2 + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // Mode 7 mrl 0 + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 2 + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // Mode 8 mrl 0 + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // mrl 2 + 0x04, 0x05, 0x06, 0x07, 
0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 9 mrl 0 + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // mrl 2 + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // Mode 10 mrl 0 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 11 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 
0x06, // mrl 2 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, // Mode 12 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 13 mrl 0 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 + 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, // Mode 14 mrl 0 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 
0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 15 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 16 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 17 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 
0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 18 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 19 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 20 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 
0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 21 mrl 0 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 + 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 22 mrl 0 + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, // mrl 1 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 23 mrl 0 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 
0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 24 mrl 0 + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 25 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 26 mrl 0 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, // mrl 1 + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 
0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 2 + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 27 mrl 0 + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 1 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, // mrl 2 + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // Mode 28 mrl 0 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 2 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // Mode 29 mrl 0 + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 
0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // mrl 2 + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // Mode 30 mrl 0 + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, // mrl 1 + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, // mrl 2 + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, // Mode 31 mrl 0 + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 1 + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, // mrl 2 + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // Mode 32 mrl 0 + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 1 + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 
0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 2 + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // Mode 33 mrl 0 + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // mrl 1 + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // mrl 2 + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // Mode 34 mrl 0 + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // mrl 1 + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // mrl 2 + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, +}; + // Chroma linear interpolation filter weights for width 8, vertical modes. These also work for w16 and w32. 
static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_weights_w8_ver[] = { From 38697a649f5ed0be9164ec47e501811f503457ed Mon Sep 17 00:00:00 2001 From: Kari Siivonen Date: Mon, 9 Sep 2024 12:57:55 +0300 Subject: [PATCH 212/237] Improve intra horizontal w16, w32 and w64 for high --- src/strategies/avx2/intra-avx2.c | 97 +++++++++++++++----------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index d8408395..62688a0d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -494,9 +494,9 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* vp_01 = _mm256_shuffle_epi32(vp_01, _MM_SHUFFLE(3, 1, 2, 0)); vp_23 = _mm256_shuffle_epi32(vp_23, _MM_SHUFFLE(3, 1, 2, 0)); - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); + __m256i vmadd01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i vmadd23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(vmadd01, vmadd23); sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); sum = _mm256_srai_epi16(sum, 6); @@ -556,9 +556,9 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i vp_01 = _mm256_shuffle_epi8(vp, vpshuf0); __m256i vp_23 = _mm256_shuffle_epi8(vp, vpshuf1); - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); + __m256i vmadd01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i vmadd23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(vmadd01, vmadd23); sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); sum = _mm256_srai_epi16(sum, 6); @@ -589,18 +589,18 @@ static void angular_pred_w8_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* __m256i vp0 = _mm256_i32gather_epi32((const int*)&ref_main[y + 0], vidx, 1); __m256i vp1 = _mm256_i32gather_epi32((const int*)&ref_main[y + 1], vidx, 1); - __m256i dot_01 = _mm256_maddubs_epi16(vp0, vweights); - __m256i dot_23 = _mm256_maddubs_epi16(vp1, vweights); - __m256i sum = _mm256_hadd_epi16(dot_01, dot_23); + __m256i vmadd0 = _mm256_maddubs_epi16(vp0, vweights); + __m256i vmadd1 = _mm256_maddubs_epi16(vp1, vweights); + __m256i sum = _mm256_hadd_epi16(vmadd0, vmadd1); sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); sum = _mm256_srai_epi16(sum, 6); __m128i lo = _mm256_castsi256_si128(sum); __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i packed = _mm_packus_epi16(lo, hi); + packed = _mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0)); - _mm_store_si128((__m128i*)dst, filtered); + _mm_store_si128((__m128i*)dst, packed); dst += 16; } @@ -650,21 +650,19 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // Load samples and shuffle into place __m128i vp = _mm_loadu_si128((__m128i*)&ref_main[y + ref_offset]); __m256i vp256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vp), vp, 1); - //__m256i vp0 = _mm256_i32gather_epi32((const int*)&ref_main[y + 0], vidx, 1); - //__m256i vp1 = _mm256_i32gather_epi32((const int*)&ref_main[y + 1], vidx, 1); + __m256i vp01 = _mm256_shuffle_epi8(vp256, vpshuf01); __m256i vp23 = _mm256_shuffle_epi8(vp256, vpshuf23); - __m256i dot_01 = _mm256_maddubs_epi16(vp01, vw01); - __m256i dot_23 = 
_mm256_maddubs_epi16(vp23, vw23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); + __m256i vmadd01 = _mm256_maddubs_epi16(vp01, vw01); + __m256i vmadd23 = _mm256_maddubs_epi16(vp23, vw23); + __m256i sum = _mm256_add_epi16(vmadd01, vmadd23); sum = _mm256_add_epi16(sum, v32s); sum = _mm256_srai_epi16(sum, 6); __m128i lo = _mm256_castsi256_si128(sum); __m128i hi = _mm256_extracti128_si256(sum, 1); __m128i packed = _mm_packus_epi16(lo, hi); - //filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); _mm_store_si128((__m128i*)dst, packed); @@ -672,53 +670,45 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_w16_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) +static void angular_pred_w16_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) { - int8_t f[64][4] = { { 0 } }; - if (use_cubic) { - for (int x = 0; x < width; ++x) { - memcpy(f[x], cubic_filter_8bit_c[delta_fract[x]], sizeof(int8_t) * 4); - } - } - else { - for (int x = 0; x < width; ++x) { - const int8_t offset = (delta_fract[x] >> 1); - f[x][0] = 16 - offset; - f[x][1] = 32 - offset; - f[x][2] = 16 + offset; - f[x][3] = offset; - } - } + __m256i vw0[4]; + __m256i vw1[4]; + for (int x = 0, i = 0; x < width; x += 16, ++i) { + __m128i tmp0 = _mm_loadu_si128((__m128i*) &delta_fract[x + 0]); + __m128i tmp1 = _mm_loadu_si128((__m128i*) &delta_fract[x + 8]); - for (int x = 0; x < width; x += 16) { - __m128i tmp0 = _mm_loadu_si128((__m128i*)&delta_int[x]); - __m128i tmp1 = _mm_loadu_si128((__m128i*)&delta_int[x + 8]); __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); - __m256i w0 = _mm256_loadu_si256((__m256i*) & f[x + 0]); - __m256i w1 = _mm256_loadu_si256((__m256i*) & f[x + 8]); + vw0[i] = _mm256_i32gather_epi32((const int32_t*)filter, vidx0, 4); + vw1[i] = _mm256_i32gather_epi32((const int32_t*)filter, vidx1, 4); + } + + for (int x = 0, vi = 0; x < width; x += 16, ++vi) { + __m128i tmp0 = _mm_load_si128((__m128i*)&delta_int[x]); + __m128i tmp1 = _mm_load_si128((__m128i*)&delta_int[x + 8]); + __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); + __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); // Width 16, handle one row at a time for (int y = 0; y < height; ++y) { - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)(ref_main + y); - __m256i vp0 = _mm256_i32gather_epi32((const int*)p, vidx0, 1); - __m256i vp1 = _mm256_i32gather_epi32((const int*)p, vidx1, 1); + __m256i vp0 = _mm256_i32gather_epi32((const int*)&ref_main[y], vidx0, 1); + __m256i vp1 = _mm256_i32gather_epi32((const int*)&ref_main[y], vidx1, 1); - __m256i dot_01 = _mm256_maddubs_epi16(vp0, w0); - __m256i dot_23 = _mm256_maddubs_epi16(vp1, w1); - __m256i sum = _mm256_hadd_epi16(dot_01, dot_23); + __m256i vmadd0 = _mm256_maddubs_epi16(vp0, vw0[vi]); + __m256i vmadd1 = _mm256_maddubs_epi16(vp1, vw1[vi]); + __m256i sum = _mm256_hadd_epi16(vmadd0, vmadd1); sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); sum = _mm256_srai_epi16(sum, 6); __m128i lo = _mm256_castsi256_si128(sum); __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - filtered = _mm_shuffle_epi32(filtered, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i packed = _mm_packus_epi16(lo, hi); + packed = 
_mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0)); - _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); + _mm_store_si128((__m128i*)(dst + (y * width + x)), packed); } } } @@ -764,7 +754,8 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i packed = _mm_packus_epi16(lo, hi); packed = _mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0)); - _mm_store_si128((__m128i*)(dst + (y * width)), packed); + _mm_store_si128((__m128i*)dst, packed); + dst += 16; } } @@ -4358,13 +4349,13 @@ static void uvg_angular_pred_avx2( break; case 16: if (pred_mode < 5 || pred_mode == 33) - angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); + angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); else angular_pred_w16_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter); break; - case 32: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 64: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 32: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 64: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; From f9bc8f7f13167508561c8b656fbb10cc072362e6 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 20 Aug 2024 14:06:46 +0300 Subject: [PATCH 213/237] Update table generator scripts. --- src/strategies/avx2/intra_avx2_tables.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 60ac356b..b7681444 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1349,18 +1349,18 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w16_hor[] = { 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 2 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // Mode 33 mrl 0 - 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // mrl 1 - 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0e, 0x0f, 0x10, 0x11, 0x0d, 
0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, // mrl 2 - 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // Mode 34 mrl 0 - 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // mrl 1 - 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, - 0x0f, 0x10, 0x11, 0x12, 0x0e, 0x0f, 0x10, 0x11, 0x0d, 0x0e, 0x0f, 0x10, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, // mrl 2 - 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 33 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 34 | not used + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; From d82e53714e33e91dd408593ad0e21ab993b87bb9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Aug 2024 12:33:41 +0300 Subject: [PATCH 214/237] Replace w16 horizontal shuffle table with w64 table. w16 and w32 shuffles can be found in the 264 table. --- src/strategies/avx2/intra_avx2_tables.h | 623 ++++++++++++++++++++---- 1 file changed, 519 insertions(+), 104 deletions(-) diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index b7681444..ef955369 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -1077,290 +1077,705 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, 0x02, 0x03, 0x0a, 0x0b, 0x09, 0x0a, 0x08, 0x09, 0x07, 0x08, 0x06, 0x07, 0x05, 0x06, 0x04, 0x05, 0x03, 0x04, }; -// Intra interpolation shuffle vectors for luma w16 horizontal. -ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w16_hor[] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -11 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -10 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -9 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -8 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -7 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -6 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -5 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -4 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -3 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -2 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -1 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 0 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 1 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 2 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 3 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 4 | not used - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +// Intra interpolation shuffle vectors for luma w64 horizontal. +// w16 and w32 functions can also use values in this table. +ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w64_hor[] = { 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // Mode 5 mrl 0 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 1 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 
0x0d, 0x0b, 0x0c, 0x0d, 0x0e, 0x0b, 0x0c, 0x0d, 0x0e, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 2 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // Mode 6 mrl 0 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 1 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 
0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, // mrl 2 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // Mode 7 mrl 0 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 
0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 2 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 
0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // Mode 8 mrl 0 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 
0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, // mrl 2 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 9 mrl 0 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 
0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // mrl 2 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // Mode 10 mrl 0 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 
0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 
0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 11 mrl 0 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // mrl 2 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 
0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, // Mode 12 mrl 0 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 
0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 13 mrl 0 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 
0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02, 0x03, 0x00, 
0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, // Mode 14 mrl 0 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 
0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 15 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 
0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 16 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 
0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 17 
mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 
0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 18 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 
0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 19 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 
0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 1 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // mrl 2 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, // Mode 20 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 
0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 
0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 21 mrl 0 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 1 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 
0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 22 mrl 0 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 
0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, // mrl 1 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // mrl 2 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, // Mode 23 mrl 0 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 
0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 1 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 
0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 24 mrl 0 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, // mrl 1 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 
0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // mrl 2 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, // Mode 25 mrl 0 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 1 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 
0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, // mrl 2 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 26 mrl 0 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 
0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, // mrl 1 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // mrl 2 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 
0x03, 0x00, 0x01, 0x02, 0x03, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, // Mode 27 mrl 0 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 1 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 
0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, // mrl 2 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, + 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // Mode 28 mrl 0 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 
0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, // mrl 2 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // Mode 29 mrl 0 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 
0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // mrl 2 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 
0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // Mode 30 mrl 0 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, // mrl 1 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 
0x07, 0x08, // mrl 2 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, // Mode 31 mrl 0 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 1 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 
0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, // mrl 2 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // Mode 32 mrl 0 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 
0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 1 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, // mrl 2 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x0b, 0x0c, 0x0d, 0x0e, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x09, 0x0a, 0x0b, 0x0c, 0x08, 0x09, 0x0a, 0x0b, 0x07, 0x08, 0x09, 0x0a, 0x06, 0x07, 0x08, 0x09, + 0x06, 0x07, 0x08, 0x09, 0x05, 0x06, 0x07, 0x08, 0x04, 0x05, 0x06, 0x07, 0x03, 0x04, 0x05, 0x06, 0x02, 0x03, 0x04, 
0x05, 0x02, 0x03, 0x04, 0x05, 0x01, 0x02, 0x03, 0x04, 0x00, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 33 | not used 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode 34 | not used 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; From 999cd2c1ce894b26526881083204518188af912f Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Aug 2024 12:34:05 +0300 Subject: [PATCH 215/237] tmp commit --- src/strategies/avx2/intra-avx2.c | 79 +++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 62688a0d..b7df1d28 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -728,11 +728,12 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx0, 4); __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx1, 4); - const int mode_idx = pred_mode <= 34 ? pred_mode + 12 : 80 - pred_mode; // Considers also wide angle modes. - const int table_offset = mode_idx * 192 + multi_ref_line * 64; + // Unused modes are pruned from the table and it starts from mode 5. Offset mode 5 to zero index. 
+ const int mode_idx = pred_mode - 5; + const int table_offset = mode_idx * 768 + multi_ref_line * 256; // mode_idx * (3 * 256) + mrl * 256 - const __m256i vpshuf01 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w16_hor[table_offset + 0]); - const __m256i vpshuf23 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w16_hor[table_offset + 32]); + const __m256i vpshuf0 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[table_offset + 0]); + const __m256i vpshuf1 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[table_offset + 32]); // Width 16, handle one row at a time for (int y = 0; y < height; ++y) { @@ -740,8 +741,8 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m128i vp = _mm_loadu_si128((__m128i*)&ref_main[y + ref_offset]); __m256i vp256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vp), vp, 1); - __m256i vp0 = _mm256_shuffle_epi8(vp256, vpshuf01); - __m256i vp1 = _mm256_shuffle_epi8(vp256, vpshuf23); + __m256i vp0 = _mm256_shuffle_epi8(vp256, vpshuf0); + __m256i vp1 = _mm256_shuffle_epi8(vp256, vpshuf1); __m256i vmadd0 = _mm256_maddubs_epi16(vp0, vw0); __m256i vmadd1 = _mm256_maddubs_epi16(vp1, vw1); @@ -759,6 +760,64 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } +static void angular_pred_w32_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t multi_ref_line, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) +{ + // const int width = 32; + const __m256i v32s = _mm256_set1_epi16(32); + + // Unused modes are pruned from the table and it starts from mode 5. Offset mode 5 to zero index. 
+ const int mode_idx = pred_mode - 5; + const int table_offset = mode_idx * 768 + multi_ref_line * 256; // mode_idx * (3 * 256) + mrl * 256 + + /*__m256i vw[8]; + __m256i vpshuf[8]; + for (int x = 0, vid = 0, table = 0; x < width; x += 8, ++vid, table += 32) { + __m128i tmp = _mm_loadu_si128((__m128i*) &delta_fract[x]); + __m256i vidx = _mm256_cvtepi16_epi32(tmp); + vw[vid] = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); + vpshuf[vid] = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[table_offset + table]); + }*/ + + for (int x = 0, shuf = table_offset; x < width; x += 16, shuf += 64) { + const int ref_offset = MIN(delta_int[x], delta_int[x + 15]); + + __m128i tmp0 = _mm_load_si128((__m128i*)&delta_fract[x]); + __m128i tmp1 = _mm_load_si128((__m128i*)&delta_fract[x + 8]); + + __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); + __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); + + __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx0, 4); + __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx1, 4); + + __m256i vpshuf0 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[shuf + 0]); + __m256i vpshuf1 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[shuf + 32]); + + // Width 16, handle one row at a time + for (int y = 0; y < height; ++y) { + // Do 4-tap intra interpolation filtering + __m128i vp = _mm_loadu_si128((__m128i*) &ref_main[y + ref_offset]); + __m256i vp256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vp), vp, 1); + + __m256i vp0 = _mm256_shuffle_epi8(vp256, vpshuf0); + __m256i vp1 = _mm256_shuffle_epi8(vp256, vpshuf1); + + __m256i vmadd0 = _mm256_maddubs_epi16(vp0, vw0); + __m256i vmadd1 = _mm256_maddubs_epi16(vp1, vw1); + __m256i sum = _mm256_hadd_epi16(vmadd0, vmadd1); + sum = _mm256_add_epi16(sum, v32s); + sum = _mm256_srai_epi16(sum, 6); + + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i packed = _mm_packus_epi16(lo, hi); + packed = _mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm_store_si128((__m128i*)(dst + (y * width + x)), packed); + } + } +} + static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) { @@ -4354,7 +4413,13 @@ static void uvg_angular_pred_avx2( angular_pred_w16_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, height, pfilter); break; - case 32: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 32: + if (pred_mode < 5 || pred_mode == 33) + angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); + else + angular_pred_w32_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, width, height, pfilter); + + break; case 64: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; default: assert(false && "Intra angular predicion: illegal width.\n"); From b32df3fe0f7a925c1a907631bbc217378c9759cb Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Aug 2024 13:52:10 +0300 Subject: [PATCH 216/237] Use w32 horizontal for w64. w16 could be done with the same function, but it was slightly faster without the for loop overhead. 
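(Sketch of the resulting dispatch; the function names and guard conditions are taken from the hunk below, only the fall-through grouping of the cases is illustrative. Width 16 keeps its dedicated kernel because the per-16-column loop of the shared function costs more than it saves at that width.)

    switch (width) {
      ...
      case 32:
      case 64:
        // Modes outside the shuffle table (pred_mode < 5 or 33) still take
        // the high-angle path; the rest share the loop-based w32 kernel.
        if (pred_mode < 5 || pred_mode == 33)
          angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter);
        else
          angular_pred_w32_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, width, height, pfilter);
        break;
      ...
    }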
--- src/strategies/avx2/intra-avx2.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index b7df1d28..184fded2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -760,24 +760,15 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } +// Note: use this same function also for w64. w16 could use this, but it was slightly faster without the for loop overheads static void angular_pred_w32_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t pred_mode, const int16_t multi_ref_line, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) { - // const int width = 32; const __m256i v32s = _mm256_set1_epi16(32); // Unused modes are pruned from the table and it starts from mode 5. Offset mode 5 to zero index. const int mode_idx = pred_mode - 5; const int table_offset = mode_idx * 768 + multi_ref_line * 256; // mode_idx * (3 * 256) + mrl * 256 - /*__m256i vw[8]; - __m256i vpshuf[8]; - for (int x = 0, vid = 0, table = 0; x < width; x += 8, ++vid, table += 32) { - __m128i tmp = _mm_loadu_si128((__m128i*) &delta_fract[x]); - __m256i vidx = _mm256_cvtepi16_epi32(tmp); - vw[vid] = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); - vpshuf[vid] = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[table_offset + table]); - }*/ - for (int x = 0, shuf = table_offset; x < width; x += 16, shuf += 64) { const int ref_offset = MIN(delta_int[x], delta_int[x + 15]); @@ -4420,7 +4411,13 @@ static void uvg_angular_pred_avx2( angular_pred_w32_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, width, height, pfilter); break; - case 64: angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 64: + if (pred_mode < 5 || pred_mode == 33) + angular_pred_w16_hor_high_angle_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); + else + angular_pred_w32_hor_avx2(dst, ref_main, pred_mode, multi_ref_index, delta_int, delta_fract, width, height, pfilter); + + break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; From 132093d71b974c009b3cc8675a3c46ba060e9ca4 Mon Sep 17 00:00:00 2001 From: Kari Siivonen Date: Mon, 9 Sep 2024 13:00:21 +0300 Subject: [PATCH 217/237] Improve intra vertical w4. 
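(Summary sketch of what the hunk below does; variable names are the ones used in the new code.) The per-row weight setup that built a small int16 table with memcpy or scalar code is replaced by a single gather, and the four row offsets are widened straight from delta_int instead of being assembled by hand:

    // Four rows at a time: one gather fetches all four 4-byte weight
    // quadruplets selected by delta_fract.
    __m128i vdfract     = _mm_loadu_si128((__m128i*)&delta_fract[y]);
    __m128i vidxw       = _mm_cvtepi16_epi32(vdfract);
    __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw, 4);

    // Row offsets for the 64-bit gather of reference pixels.
    __m128i vdelta_int  = _mm_loadu_si128((__m128i*)&delta_int[y]);
    __m256i vidx        = _mm256_cvtepi16_epi64(vdelta_int);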
--- src/strategies/avx2/intra-avx2.c | 92 +++++++++++++------------------- 1 file changed, 36 insertions(+), 56 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 184fded2..f956b864 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -169,7 +169,7 @@ ALIGNED(32) static const int8_t cubic_filter_8bit_g[32][4] = }; -static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) { const int width = 4; @@ -187,75 +187,55 @@ static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e ); -ALIGNED(32) static const uint8_t delta_fract_symmetry[] = { - 1, 32, 16, 32, 8, 16, 2, 16, 8, 16, 4, 16, 32, 16, 32, 1, - 32, 16, 32, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 8, 32, 16, 32, 1 -}; - const __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d ); const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f ); - - int16_t f[4][4] = { { 0 } }; + // Do 4-tap intra interpolation filtering // For a 4 width block, height must be at least 4. Handle 4 lines at once for (int y = 0; y < height; y += 4) { - if (use_cubic) { - memcpy(f[0], cubic_filter[delta_fract[y + 0]], 8); - memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); - memcpy(f[2], cubic_filter[delta_fract[y + 2]], 8); - memcpy(f[3], cubic_filter[delta_fract[y + 3]], 8); - } - else { - for (int yy = 0; yy < 4; ++yy) { - const int16_t offset = (delta_fract[y + yy] >> 1); - f[yy][0] = 16 - offset; - f[yy][1] = 32 - offset; - f[yy][2] = 16 + offset; - f[yy][3] = offset; - } - } + // Copy the filter to local memory + __m128i vdfract = _mm_loadu_si128((__m128i*)&delta_fract[y]); + __m128i vidxw = _mm_cvtepi16_epi32(vdfract); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw, 4); + + __m256i weights256 = _mm256_insertf128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); + + // Shuffle the interpolation weights into place. + __m256i w01 = _mm256_shuffle_epi8(weights256, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(weights256, w_shuf_23); - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)ref_main; // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. 
- __m256i vidx = _mm256_setr_epi64x(delta_int[y + 0], - delta_int[y + 1], - delta_int[y + 2], - delta_int[y + 3]); - __m256i all_weights = _mm256_loadu_si256((__m256i*)f); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); - - for (int_fast32_t x = 0; x + 3 < width; x += 4, p += 4) { + __m128i vdelta_int = _mm_loadu_si128((__m128i*)&delta_int[y]); + __m256i vidx = _mm256_cvtepi16_epi64(vdelta_int); - __m256i vp = _mm256_i64gather_epi64((const long long int*)p, vidx, 1); - __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); - __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); + __m256i vp = _mm256_i64gather_epi64((const long long int*)ref_main, vidx, 1); + __m256i vp_01 = _mm256_shuffle_epi8(vp, p_shuf_01); + __m256i vp_23 = _mm256_shuffle_epi8(vp, p_shuf_23); - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); + __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(dot_01, dot_23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i packed = _mm_packus_epi16(lo, hi); - _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); - } + _mm_store_si128((__m128i*)dst, packed); + dst += 16; } } @@ -4371,7 +4351,7 @@ static void uvg_angular_pred_avx2( if (channel_type == 0) { if (vertical_mode) { switch (width) { - case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; case 32: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; From 44f0fe3c5c92702503cb78d964f54f5495f3a62f Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:02:53 +0300 Subject: [PATCH 218/237] Remove void cast. It was not necessary. --- src/strategies/avx2/intra-avx2.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f956b864..be9899a6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -450,7 +450,7 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* // Copy the filter to local memory __m128i vdfract = _mm_load_si128((__m128i*)delta_fract); __m128i vidx = _mm_cvtepi16_epi32(vdfract); - __m128i all_weights = _mm_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidx, 4); __m256i weights256 = _mm256_insertf128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); // Shuffle the interpolation weights into place. 
@@ -518,7 +518,7 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // Copy the filter to local memory __m128i vdfract = _mm_load_si128((__m128i*)delta_fract); __m128i vidx = _mm_cvtepi16_epi32(vdfract); - __m128i all_weights = _mm_i32gather_epi32((const int32_t*)(void*)filter, vidx, 4); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidx, 4); __m256i weights256 = _mm256_insertf128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); @@ -560,7 +560,7 @@ static void angular_pred_w8_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* // Load weights tmp = _mm_load_si128((__m128i*)delta_fract); __m256i vidxw = _mm256_cvtepi16_epi32(tmp); - __m256i vweights = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidxw, 4); + __m256i vweights = _mm256_i32gather_epi32((const int32_t*)filter, vidxw, 4); for (int y = 0; y < height; y += 2) { @@ -610,7 +610,7 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // Load weights __m128i tmp = _mm_load_si128((__m128i*)delta_fract); __m256i vidxw = _mm256_cvtepi16_epi32(tmp); - __m256i vweights = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidxw, 4); + __m256i vweights = _mm256_i32gather_epi32((const int32_t*)filter, vidxw, 4); __m256i vw01 = _mm256_shuffle_epi8(vweights, vwshuf01); __m256i vw23 = _mm256_shuffle_epi8(vweights, vwshuf23); @@ -705,8 +705,8 @@ static void angular_pred_w16_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); - __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx0, 4); - __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx1, 4); + __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)filter, vidx0, 4); + __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)filter, vidx1, 4); // Unused modes are pruned from the table and it starts from mode 5. Offset mode 5 to zero index. const int mode_idx = pred_mode - 5; @@ -758,8 +758,8 @@ static void angular_pred_w32_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); - __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx0, 4); - __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)(void*)filter, vidx1, 4); + __m256i vw0 = _mm256_i32gather_epi32((const int32_t*)filter, vidx0, 4); + __m256i vw1 = _mm256_i32gather_epi32((const int32_t*)filter, vidx1, 4); __m256i vpshuf0 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[shuf + 0]); __m256i vpshuf1 = _mm256_loadu_si256((__m256i*) &intra_luma_interpolation_shuffle_vectors_w64_hor[shuf + 32]); From 61ffd5d5c2b8f8aa789b6717d0cc5ec6e418eef7 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 22 Aug 2024 16:09:15 +0300 Subject: [PATCH 219/237] Improve intra vertical w8. 
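(A sketch of the same weight-gather idea applied here; names follow the hunk below.) With the block width fixed at 8, the inner x loop goes away: each two-row iteration gathers the weights for both rows at once and produces exactly one 16-byte store.

    for (int y = 0; y < height; y += 2) {
      // Weights for rows y and y+1 in one gather, then broadcast to both lanes.
      __m128i vidxw       = _mm_load_si128((__m128i*)&delta_fract[y]);
      __m128i vidxw32     = _mm_cvtepi16_epi32(vidxw);
      __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw32, 4);
      __m256i aw256       = _mm256_inserti128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1);
      // ... shuffle, maddubs, round, pack ...
      dst += 16;
    }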
--- src/strategies/avx2/intra-avx2.c | 73 +++++++++++++++++--------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index be9899a6..53dcfc78 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -239,9 +239,9 @@ static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int use_cubic) +static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) { - const int width = 8; + //const int width = 8; const __m128i p_shuf_01 = _mm_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -254,22 +254,23 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, ); const __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05 ); const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 ); + // Do 4-tap intra interpolation filtering // For a 8 width block, height must be at least 2. Handle 2 lines at once for (int y = 0; y < height; y += 2) { - __m256i all_weights; + /*__m256i all_weights; if (use_cubic) { int16_t tmp[8]; memcpy(&tmp[0], cubic_filter[delta_fract[y + 0]], 4 * sizeof(int16_t)); @@ -287,37 +288,39 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, tmp[idx + 3] = offset; } all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); - } + }*/ - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)ref_main; + // Load and shuffle filter weights + __m128i vidxw = _mm_load_si128((__m128i*)&delta_fract[y]); + __m128i vidxw32 = _mm_cvtepi16_epi32(vidxw); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw32, 4); + __m256i aw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); - // Weights are 16-bit, but shuffle will cut out the unnecessary bits. 
- __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + __m256i w01 = _mm256_shuffle_epi8(aw256, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(aw256, w_shuf_23); - for (int_fast32_t x = 0; x < width; x += 8, p += 8) { - __m128i vp0 = _mm_loadu_si128((__m128i*)(p + delta_int[y + 0])); - __m128i vp1 = _mm_loadu_si128((__m128i*)(p + delta_int[y + 1])); + // Load and shuffle reference pixels + __m128i vp0 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 0])); + __m128i vp1 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 1])); - __m256i vp_01 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_01)); - vp_01 = _mm256_inserti128_si256(vp_01, _mm_shuffle_epi8(vp1, p_shuf_01), 1); + __m256i vp_01 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_01)); + vp_01 = _mm256_inserti128_si256(vp_01, _mm_shuffle_epi8(vp1, p_shuf_01), 1); - __m256i vp_23 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_23)); - vp_23 = _mm256_inserti128_si256(vp_23, _mm_shuffle_epi8(vp1, p_shuf_23), 1); + __m256i vp_23 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_23)); + vp_23 = _mm256_inserti128_si256(vp_23, _mm_shuffle_epi8(vp1, p_shuf_23), 1); - __m256i dot_01 = _mm256_maddubs_epi16(vp_01, w01); - __m256i dot_23 = _mm256_maddubs_epi16(vp_23, w23); - __m256i sum = _mm256_add_epi16(dot_01, dot_23); - sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); - sum = _mm256_srai_epi16(sum, 6); + __m256i vmadd01 = _mm256_maddubs_epi16(vp_01, w01); + __m256i vmadd23 = _mm256_maddubs_epi16(vp_23, w23); + __m256i sum = _mm256_add_epi16(vmadd01, vmadd23); + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(32)); + sum = _mm256_srai_epi16(sum, 6); - __m128i lo = _mm256_castsi256_si128(sum); - __m128i hi = _mm256_extracti128_si256(sum, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + __m128i lo = _mm256_castsi256_si128(sum); + __m128i hi = _mm256_extracti128_si256(sum, 1); + __m128i packed = _mm_packus_epi16(lo, hi); - _mm_store_si128((__m128i*)(dst + (y * 8)), filtered); - } + _mm_store_si128((__m128i*)dst, packed); + dst += 16; } } @@ -4352,7 +4355,7 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; - case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, use_cubic); break; + case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; case 32: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; case 64: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; From b0410a593fb2a5967f2c9f3d33920af7f1be70f4 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 23 Aug 2024 11:30:26 +0300 Subject: [PATCH 220/237] Improve intra vertical w16. w32 and w64 can use this same function. 
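(Outline only, not the literal code.) The reason one kernel can serve every width from 16 upwards: for vertical modes the fractional offset, and therefore the 4-tap weight set, depends only on the row, so the weights are fetched once per row and reused while the columns are walked in 16-pixel steps:

    for (int y = 0; y < height; ++y) {
      const int8_t* w = filter[delta_fract[y]];   // one weight fetch per row
      for (int x = 0; x < width; x += 16) {
        // filter 16 pixels loaded from ref_main + x + delta_int[y]
      }
    }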
--- src/strategies/avx2/intra-avx2.c | 89 ++++++++------------------------ 1 file changed, 21 insertions(+), 68 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 53dcfc78..8031b6ad 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -270,26 +270,7 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, // Do 4-tap intra interpolation filtering // For a 8 width block, height must be at least 2. Handle 2 lines at once for (int y = 0; y < height; y += 2) { - /*__m256i all_weights; - if (use_cubic) { - int16_t tmp[8]; - memcpy(&tmp[0], cubic_filter[delta_fract[y + 0]], 4 * sizeof(int16_t)); - memcpy(&tmp[4], cubic_filter[delta_fract[y + 1]], 4 * sizeof(int16_t)); - all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); - } - else { - int16_t tmp[8]; - for (int yy = 0; yy < 2; ++yy) { - const int16_t offset = (delta_fract[y + yy] >> 1); - const int idx = yy * 4; - tmp[idx + 0] = 16 - offset; - tmp[idx + 1] = 32 - offset; - tmp[idx + 2] = 16 + offset; - tmp[idx + 3] = offset; - } - all_weights = _mm256_setr_epi64x(*(int64_t*)&tmp[0], *(int64_t*)&tmp[4], *(int64_t*)&tmp[0], *(int64_t*)&tmp[4]); - }*/ - + // Load and shuffle filter weights __m128i vidxw = _mm_load_si128((__m128i*)&delta_fract[y]); __m128i vidxw32 = _mm_cvtepi16_epi32(vidxw); @@ -324,7 +305,7 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int use_cubic) +static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) { const __m256i p_shuf_01 = _mm256_setr_epi8( 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, @@ -341,60 +322,32 @@ static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, ); const __m256i w_shuf_01 = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 ); const __m256i w_shuf_23 = _mm256_setr_epi8( - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, - 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, 0x04, 0x06, - 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e, 0x0c, 0x0e + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03 ); - //int16_t f[4][4] = { { 0 } }; - + // Do 4-tap intra interpolation filtering // For a 16 width block, height can be 1. 
for (int y = 0; y < height; ++y) { - __m256i all_weights; - if (use_cubic) { - //memcpy(f[0], cubic_filter[delta_fract[y + 0]], 8); - //memcpy(f[1], cubic_filter[delta_fract[y + 1]], 8); - //memcpy(f[2], cubic_filter[delta_fract[y + 2]], 8); - //memcpy(f[3], cubic_filter[delta_fract[y + 3]], 8); - //int64_t *tmp = (int64_t*)&delta_fract[y]; - int16_t tmp[4]; - memcpy(&tmp, cubic_filter[delta_fract[y]], 8); - all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); - } - else { - const int16_t offset = (delta_fract[y] >> 1); - int16_t tmp[4]; - tmp[0] = 16 - offset; - tmp[1] = 32 - offset; - tmp[2] = 16 + offset; - tmp[3] = offset; - all_weights = _mm256_set1_epi64x(*(int64_t*)tmp); - } - // Do 4-tap intra interpolation filtering - uvg_pixel* p = (uvg_pixel*)ref_main; - // This solution assumes the delta int values to be 64-bit - // Cast from 16-bit to 64-bit. - __m256i vidx = _mm256_setr_epi64x(delta_int[y] + 0, - delta_int[y] + 4, - delta_int[y] + 8, - delta_int[y] + 12); - - //__m256i all_weights = _mm256_loadu_si256((__m256i*)f); + // Load and shuffle filter weights + __m128i vweights = _mm_load_si128((__m128i*)&filter[delta_fract[y]]); + __m256i vw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vweights), vweights, 1); - __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); - __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); + __m256i w01 = _mm256_shuffle_epi8(vw256, w_shuf_01); + __m256i w23 = _mm256_shuffle_epi8(vw256, w_shuf_23); - for (int_fast32_t x = 0; x < width; x += 16, p += 16) { - __m256i vp = _mm256_loadu_si256((__m256i*)(p + delta_int[y])); + for (int x = 0; x < width; x += 16) { + __m256i vp = _mm256_loadu_si256((__m256i*)(ref_main + x + delta_int[y])); __m256i tmp = _mm256_permute4x64_epi64(vp, _MM_SHUFFLE(2, 1, 1, 0)); @@ -4356,9 +4309,9 @@ static void uvg_angular_pred_avx2( switch (width) { case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; - case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 32: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; - case 64: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, use_cubic); break; + case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 32: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 64: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; default: assert(false && "Intra angular predicion: illegal width.\n"); break; From c71974681cdb24f7139a6c852953297237520979 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:04:26 +0300 Subject: [PATCH 221/237] Add version of w8 vertical which handles 4 rows. This is more efficient than the version which calculates 2 rows. It is now only used when height < 4. 
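In other words, the old two-row kernel is kept only for 8x2 blocks and everything taller goes through the new four-row kernel. The resulting dispatch (as in the hunk below):

    case 8:
      if (height < 4)
        angular_pred_w8_h2_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); // old 2-row kernel, 8x2 only
      else
        angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter);    // new 4-row kernel
      break;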
--- src/strategies/avx2/intra-avx2.c | 116 ++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 8031b6ad..1c8dc736 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -305,6 +305,111 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } +static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) +{ + //const int width = 8; + + const __m128i p_shuf_01 = _mm_setr_epi8( + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, + 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x08 + ); + + const __m128i p_shuf_23 = _mm_setr_epi8( + 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a + ); + + const __m256i w_shuf_01_row01 = _mm256_setr_epi8( + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05 + ); + + const __m256i w_shuf_23_row01 = _mm256_setr_epi8( + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07 + ); + + const __m256i w_shuf_01_row23 = _mm256_setr_epi8( + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, + 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d, 0x0c, 0x0d + ); + + const __m256i w_shuf_23_row23 = _mm256_setr_epi8( + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, + 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, 0x0a, 0x0b, + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, + 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f, 0x0e, 0x0f + ); + + // Do 4-tap intra interpolation filtering + // For a 8 width block, height must be at least 2. This version handles 4 lines at once to minimize vidx loads. + // No need to check height 2 cases, other function handles that. 
+ for (int y = 0; y < height; y += 4) { + + // Load and shuffle filter weights + __m128i vidxw = _mm_load_si128((__m128i*) & delta_fract[y]); + __m128i vidxw32 = _mm_cvtepi16_epi32(vidxw); + __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw32, 4); + __m256i aw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); + + __m256i w01_row01 = _mm256_shuffle_epi8(aw256, w_shuf_01_row01); + __m256i w23_row01 = _mm256_shuffle_epi8(aw256, w_shuf_23_row01); + __m256i w01_row23 = _mm256_shuffle_epi8(aw256, w_shuf_01_row23); + __m256i w23_row23 = _mm256_shuffle_epi8(aw256, w_shuf_23_row23); + + // Load and shuffle reference pixels + __m128i vp0 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 0])); + __m128i vp1 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 1])); + __m128i vp2 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 2])); + __m128i vp3 = _mm_loadu_si128((__m128i*)(ref_main + delta_int[y + 3])); + + __m256i vp_01_row01 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_01)); + vp_01_row01 = _mm256_inserti128_si256(vp_01_row01, _mm_shuffle_epi8(vp1, p_shuf_01), 1); + + __m256i vp_23_row01 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp0, p_shuf_23)); + vp_23_row01 = _mm256_inserti128_si256(vp_23_row01, _mm_shuffle_epi8(vp1, p_shuf_23), 1); + + __m256i vp_01_row23 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp2, p_shuf_01)); + vp_01_row23 = _mm256_inserti128_si256(vp_01_row23, _mm_shuffle_epi8(vp3, p_shuf_01), 1); + + __m256i vp_23_row23 = _mm256_castsi128_si256(_mm_shuffle_epi8(vp2, p_shuf_23)); + vp_23_row23 = _mm256_inserti128_si256(vp_23_row23, _mm_shuffle_epi8(vp3, p_shuf_23), 1); + + __m256i vmadd01_row01 = _mm256_maddubs_epi16(vp_01_row01, w01_row01); + __m256i vmadd23_row01 = _mm256_maddubs_epi16(vp_23_row01, w23_row01); + __m256i vmadd01_row23 = _mm256_maddubs_epi16(vp_01_row23, w01_row23); + __m256i vmadd23_row23 = _mm256_maddubs_epi16(vp_23_row23, w23_row23); + + + __m256i sum01 = _mm256_add_epi16(vmadd01_row01, vmadd23_row01); + __m256i sum23 = _mm256_add_epi16(vmadd01_row23, vmadd23_row23); + sum01 = _mm256_add_epi16(sum01, _mm256_set1_epi16(32)); + sum23 = _mm256_add_epi16(sum23, _mm256_set1_epi16(32)); + sum01 = _mm256_srai_epi16(sum01, 6); + sum23 = _mm256_srai_epi16(sum23, 6); + + __m128i lo01 = _mm256_castsi256_si128(sum01); + __m128i hi01 = _mm256_extracti128_si256(sum01, 1); + __m128i lo23 = _mm256_castsi256_si128(sum23); + __m128i hi23 = _mm256_extracti128_si256(sum23, 1); + + __m128i packed01 = _mm_packus_epi16(lo01, hi01); + __m128i packed23 = _mm_packus_epi16(lo23, hi23); + //__m256i packed = _mm256_inserti128_si256(_mm256_castsi128_si256(packed01), packed23, 1); + + //_mm256_store_si256((__m256i*)dst, packed); + _mm_store_si128((__m128i*)(dst + 0), packed01); + _mm_store_si128((__m128i*)(dst + 16), packed23); + dst += 32; + } +} + static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) { const __m256i p_shuf_01 = _mm256_setr_epi8( @@ -4308,9 +4413,14 @@ static void uvg_angular_pred_avx2( if (vertical_mode) { switch (width) { case 4: angular_pred_w4_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; - case 8: angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); break; - case 16: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; - case 32: angular_pred_w16_ver_avx2(dst, 
ref_main, delta_int, delta_fract, width, height, pfilter); break; + case 8: + if (height < 4) + angular_pred_w8_h2_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); + else + angular_pred_w8_ver_avx2(dst, ref_main, delta_int, delta_fract, height, pfilter); + break; + case 16: // Use w16 function for all widths 16 and up + case 32: case 64: angular_pred_w16_ver_avx2(dst, ref_main, delta_int, delta_fract, width, height, pfilter); break; default: assert(false && "Intra angular predicion: illegal width.\n"); From 6cc5e277ebbd5f06c3f3bf186b6c4a2386b51b59 Mon Sep 17 00:00:00 2001 From: siivonek Date: Fri, 23 Aug 2024 15:24:28 +0300 Subject: [PATCH 222/237] Remove ref line nullification. Fix issue with ISP ref length calculation. --- src/strategies/avx2/intra-avx2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1c8dc736..acff56d2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4303,8 +4303,8 @@ static void uvg_angular_pred_avx2( // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. - uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX]; + uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX]; int32_t pred_mode = intra_mode; // ToDo: handle WAIP @@ -4314,13 +4314,14 @@ static void uvg_angular_pred_avx2( // Modes distance to horizontal or vertical mode. Possible values: [-16, 16] // For pure vertical or horizontal modes, this is 0. For pure diagonal modes, this is either -16 or 16. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18); + const int_fast8_t abs_mode_disp = abs(mode_disp); const bool wide_angle_mode = mode_disp > 16; // Sample displacement per column in fractions of 32. - const int_fast16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + const int_fast16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs_mode_disp]; const int side_size = vertical_mode ? log2_height : log2_width; - int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); + int scale = MIN(2, side_size - pre_scale[abs_mode_disp]); // Pointer for the reference we are interpolating from. uvg_pixel* ref_main; @@ -4342,8 +4343,9 @@ static void uvg_angular_pred_avx2( ref_side = vertical_mode ? temp_side + width : temp_main + height; int size_side = vertical_mode ? height : width; + const int modedisp2invsampledisp_abs = modedisp2invsampledisp[abs_mode_disp]; for (int i = -size_side; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp_abs + 256) >> 9, size_side)]; } } else { From 5f4f9027f9b7bfd66818928d3252fb52db817887 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 26 Aug 2024 13:55:10 +0300 Subject: [PATCH 223/237] Change intra generic and avx2 code files from c to cpp. Modify Makefile accordingly. From fe6d1a2c6545b93790960c403a8020be0a58e862 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:05:37 +0300 Subject: [PATCH 224/237] Fix some of the segmentation faults. 
--- src/strategies/avx2/intra-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index acff56d2..47795a50 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -445,7 +445,7 @@ static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, for (int y = 0; y < height; ++y) { // Load and shuffle filter weights - __m128i vweights = _mm_load_si128((__m128i*)&filter[delta_fract[y]]); + __m128i vweights = _mm_loadu_si128((__m128i*)&filter[delta_fract[y]]); __m256i vw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vweights), vweights, 1); __m256i w01 = _mm256_shuffle_epi8(vw256, w_shuf_01); From 46e382e14712ab7e932de39928c2bcc48b907839 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 26 Aug 2024 15:10:49 +0300 Subject: [PATCH 225/237] Change some loads to unaligned load. There is no guarantee that these would be aligned, causing segfaults on linux. --- src/strategies/avx2/intra-avx2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 47795a50..a9d12ec3 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -272,7 +272,7 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, for (int y = 0; y < height; y += 2) { // Load and shuffle filter weights - __m128i vidxw = _mm_load_si128((__m128i*)&delta_fract[y]); + __m128i vidxw = _mm_loadu_si128((__m128i*)&delta_fract[y]); __m128i vidxw32 = _mm_cvtepi16_epi32(vidxw); __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw32, 4); __m256i aw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); @@ -353,7 +353,7 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, for (int y = 0; y < height; y += 4) { // Load and shuffle filter weights - __m128i vidxw = _mm_load_si128((__m128i*) & delta_fract[y]); + __m128i vidxw = _mm_loadu_si128((__m128i*) &delta_fract[y]); __m128i vidxw32 = _mm_cvtepi16_epi32(vidxw); __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidxw32, 4); __m256i aw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(all_weights), all_weights, 1); From 9a618910b19b66661ab1dbb1e688193285ed05d8 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:12:50 +0300 Subject: [PATCH 226/237] Improve reference building when both references are used for side ref sizes 4, 8 and 16. 
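The scalar projection loop being specialised here (and kept as the fallback for side sizes 32 and 64) maps every negative main-reference index onto the side reference:

    for (int i = -size_side; i <= -1; i++) {
      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs_mode_disp] + 256) >> 9, size_side)];
    }

For side sizes 4, 8 and 16 the projected indices always fit within a single 16-byte load of the side reference, so the index pattern for each |mode_disp| can be precomputed and the whole loop replaced by one load plus one _mm_shuffle_epi8 against the new intra_refbuild_shuffle_vectors_sidesize_* tables added below.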
--- src/strategies/avx2/intra-avx2.c | 56 ++++++++++++++++++---- src/strategies/avx2/intra_avx2_tables.h | 63 +++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index a9d12ec3..f358eb62 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -239,7 +239,7 @@ static void angular_pred_w4_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } -static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) +static void angular_pred_w8_h2_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int height, const int8_t(*filter)[4]) { //const int width = 8; @@ -4336,16 +4336,56 @@ static void uvg_angular_pred_avx2( if (sample_disp < 0) { // In cases where sample_disp is negative, references are needed from both sides. // This step combines the main and side reference. - memcpy(&temp_main[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); - memcpy(&temp_side[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); + if (vertical_mode) { + memcpy(&temp_main[height], in_ref_above, (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + } + else { + memcpy(&temp_main[width], in_ref_left, (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); + } + //memcpy(&temp_main[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + //memcpy(&temp_side[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); - ref_main = vertical_mode ? temp_main + height : temp_side + width; - ref_side = vertical_mode ? temp_side + width : temp_main + height; + ref_main = vertical_mode ? &temp_main[height] : &temp_main[width]; + ref_side = vertical_mode ? in_ref_left : in_ref_above; int size_side = vertical_mode ? height : width; - const int modedisp2invsampledisp_abs = modedisp2invsampledisp[abs_mode_disp]; - for (int i = -size_side; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp_abs + 256) >> 9, size_side)]; + switch (size_side) { + case 4: + { + int shuf_offset = abs_mode_disp * 16; + __m128i vshuf = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_4[shuf_offset]); + __m128i vref = _mm_loadu_si128((const __m128i*) &ref_side[0]); + vref = _mm_shuffle_epi8(vref, vshuf); + uint32_t tmp = _mm_extract_epi32(vref, 0); + memcpy(&temp_main[0], &tmp, sizeof(uint32_t)); + break; + } + case 8: + { + int shuf_offset = abs_mode_disp * 16; + __m128i vshuf = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_8[shuf_offset]); + __m128i vref = _mm_loadu_si128((const __m128i*) &ref_side[0]); + vref = _mm_shuffle_epi8(vref, vshuf); + uint64_t tmp = _mm_extract_epi64(vref, 0); + memcpy(&temp_main[0], &tmp, sizeof(uint64_t)); + break; + } + case 16: + { + int shuf_offset = abs_mode_disp * 16; + __m128i vshuf = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_16[shuf_offset]); + __m128i vref = _mm_loadu_si128((const __m128i*) &ref_side[1]); // Offset ref by one to fit all necessary 16 refs. Offset accounted for in shuffle vectors. 
+ vref = _mm_shuffle_epi8(vref, vshuf); + _mm_store_si128((__m128i*) &temp_main[0], vref); + break; + } + case 32: + case 64: + default: + const int modedisp2invsampledisp_abs = modedisp2invsampledisp[abs_mode_disp]; + for (int i = -size_side; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp_abs + 256) >> 9, size_side)]; + } } } else { diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index ef955369..19b36267 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3415,6 +3415,69 @@ static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { }; +// Intra ref building shuffle vector tables + +ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_4[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 1 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 2 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 3 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 4 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 5 + 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 6 + 0x04, 0x04, 0x04, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 7 + 0x04, 0x04, 0x04, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 8 + 0x04, 0x04, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 9 + 0x04, 0x04, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 10 + 0x04, 0x04, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 11 + 0x04, 0x04, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 12 + 0x04, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 13 + 0x04, 0x04, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 14 + 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 15 + 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 16 +}; + +ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_8[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 1 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 2 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 3 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 4 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 5 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 6 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 7 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x05, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 8 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x07, 0x05, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 9 + 0x08, 0x08, 0x08, 0x08, 0x08, 0x06, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 10 + 0x08, 0x08, 0x08, 0x08, 0x07, 0x05, 0x04, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 11 + 0x08, 0x08, 0x08, 0x08, 0x06, 0x05, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 12 + 0x08, 0x08, 0x08, 0x07, 0x06, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 13 + 0x08, 0x08, 0x07, 0x06, 0x05, 0x04, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 14 + 0x08, 0x08, 0x07, 0x06, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 15 + 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 16 +}; + +ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_16[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // mode disp 1 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // mode disp 2 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0a, // mode disp 3 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x07, // mode disp 4 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0a, 0x04, // mode disp 5 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0b, 0x07, 0x03, // mode disp 6 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0c, 0x09, 0x05, 0x02, // mode disp 7 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0c, 0x0a, 0x07, 0x04, 0x02, // mode disp 8 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0d, 0x0a, 0x08, 0x06, 0x04, 0x01, // mode disp 9 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, // mode disp 10 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0d, 0x0b, 0x0a, 0x08, 0x06, 0x04, 0x03, 0x01, // mode disp 11 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x05, 0x04, 0x02, 0x01, // mode disp 12 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0e, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x06, 0x05, 0x03, 0x02, 0x00, // mode disp 13 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0e, 0x0d, 0x0b, 0x0a, 0x09, 0x08, 0x06, 0x05, 0x04, 0x03, 0x01, 0x00, // mode disp 14 + 0x0f, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x03, 0x02, 0x01, 0x00, // mode disp 15 + 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, // mode disp 16 +}; + + // Y coord tables ALIGNED(32) static const int8_t planar_avx2_ver_w4ys[1024] = { From a3e1057a97d79bedeb072b92101fd676a05599db Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:14:20 +0300 Subject: [PATCH 227/237] Improve reference building for side reference size 32. 
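pshufb can only index within one 16-byte register, so a 32-sample side reference is handled as two unaligned 16-byte loads. Control bytes 0..31 are split by a compare: bytes above 15 pick from the upper load, the rest from the lower one, and _mm_blendv_epi8 merges the two shuffle results (pshufb itself only looks at the low four index bits, so no extra masking is needed). A generic sketch of the trick, illustrative only; the helper name is made up and the patch applies it with its own load offsets and operand order:

    /* Emulate a 32-entry byte table lookup with two 16-byte shuffles
     * and one blend. idx holds values 0..31. Requires SSE4.1 (immintrin.h). */
    static __m128i shuffle_from_32_bytes(__m128i lo16, __m128i hi16, __m128i idx)
    {
      const __m128i pick_hi = _mm_cmpgt_epi8(idx, _mm_set1_epi8(15)); /* idx > 15 -> upper half */
      const __m128i from_lo = _mm_shuffle_epi8(lo16, idx);            /* uses idx & 0x0f        */
      const __m128i from_hi = _mm_shuffle_epi8(hi16, idx);
      return _mm_blendv_epi8(from_lo, from_hi, pick_hi);
    }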
--- src/strategies/avx2/intra-avx2.c | 31 +++++++++++++++++++++---- src/strategies/avx2/intra_avx2_tables.h | 19 +++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index f358eb62..768b7ad4 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4356,8 +4356,9 @@ static void uvg_angular_pred_avx2( __m128i vshuf = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_4[shuf_offset]); __m128i vref = _mm_loadu_si128((const __m128i*) &ref_side[0]); vref = _mm_shuffle_epi8(vref, vshuf); - uint32_t tmp = _mm_extract_epi32(vref, 0); - memcpy(&temp_main[0], &tmp, sizeof(uint32_t)); + /*uint32_t tmp = _mm_extract_epi32(vref, 0); + memcpy(&temp_main[0], &tmp, sizeof(uint32_t));*/ + _mm_maskstore_epi32((int32_t*)&temp_main[0], _mm_setr_epi32(0xffffffff, 0, 0, 0), vref); break; } case 8: @@ -4366,8 +4367,9 @@ static void uvg_angular_pred_avx2( __m128i vshuf = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_8[shuf_offset]); __m128i vref = _mm_loadu_si128((const __m128i*) &ref_side[0]); vref = _mm_shuffle_epi8(vref, vshuf); - uint64_t tmp = _mm_extract_epi64(vref, 0); - memcpy(&temp_main[0], &tmp, sizeof(uint64_t)); + /*uint64_t tmp = _mm_extract_epi64(vref, 0); + memcpy(&temp_main[0], &tmp, sizeof(uint64_t));*/ + _mm_maskstore_epi32((int32_t*)&temp_main[0], _mm_setr_epi32(0xffffffff, 0xffffffff, 0, 0), vref); break; } case 16: @@ -4380,6 +4382,27 @@ static void uvg_angular_pred_avx2( break; } case 32: + { + int shuf_offset = abs_mode_disp * 32; + __m128i vshufhi = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_32[shuf_offset + 0]); + __m128i vshuflo = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_32[shuf_offset + 16]); + __m128i vblend = _mm_cmpgt_epi8(vshuflo, _mm_set1_epi8(15)); + + __m128i vreflo = _mm_loadu_si128((const __m128i*) & ref_side[1]); // Offset ref by one to fit all necessary 16 refs. Offset accounted for in shuffle vectors. 
+ __m128i vrefhi = _mm_loadu_si128((const __m128i*) & ref_side[17]); + + // Second half of references requires samples from both sides + __m128i vreftmphi = _mm_shuffle_epi8(vrefhi, vshuflo); + __m128i vreftmplo = _mm_shuffle_epi8(vreflo, vshuflo); + vreflo = _mm_blendv_epi8(vreftmplo, vreftmphi, vblend); + + // First half of references use references from the hi side only + vrefhi = _mm_shuffle_epi8(vrefhi, vshufhi); + + _mm_store_si128((__m128i*) &temp_main[0], vrefhi); + _mm_store_si128((__m128i*) &temp_main[16], vreflo); + break; + } case 64: default: const int modedisp2invsampledisp_abs = modedisp2invsampledisp[abs_mode_disp]; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 19b36267..c7f97a55 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3702,5 +3702,24 @@ ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1200] = { 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, // 1 }; +ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_32[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // mode disp 1 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, // mode disp 2 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x14, 0x0a, // mode disp 3 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x17, 0x0f, 0x07, // mode disp 4 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1a, 0x14, 0x0f, 0x0a, 0x04, // mode disp 5 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0x07, 0x03, // mode disp 6 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1c, 0x19, 0x15, 0x12, 0x0f, 0x0c, 0x09, 0x05, 0x02, // mode disp 7 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1c, 0x1a, 0x17, 0x14, 0x12, 0x0f, 0x0c, 0x0a, 0x07, 0x04, 0x02, // mode disp 8 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1d, 0x1a, 0x18, 0x16, 0x14, 0x11, 0x0f, 0x0d, 0x0a, 0x08, 0x06, 0x04, 0x01, // mode disp 9 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1d, 0x1b, 0x19, 0x17, 
0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, // mode disp 10 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1d, 0x1b, 0x1a, 0x18, 0x16, 0x14, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x0a, 0x08, 0x06, 0x04, 0x03, 0x01, // mode disp 11 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1d, 0x1c, 0x1a, 0x19, 0x17, 0x15, 0x14, 0x12, 0x11, 0x0f, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x05, 0x04, 0x02, 0x01, // mode disp 12 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1e, 0x1c, 0x1b, 0x19, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0x10, 0x0e, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x06, 0x05, 0x03, 0x02, 0x00, // mode disp 13 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1e, 0x1d, 0x1b, 0x1a, 0x19, 0x18, 0x16, 0x15, 0x14, 0x13, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0b, 0x0a, 0x09, 0x08, 0x06, 0x05, 0x04, 0x03, 0x01, 0x00, // mode disp 14 + 0x1f, 0x1f, 0x1f, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x03, 0x02, 0x01, 0x00, // mode disp 15 + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, // mode disp 16 +}; #endif INTRA_AVX2_TABLES_H From 6e3d53948d5b4c0a8b434135555aca243fa4e955 Mon Sep 17 00:00:00 2001 From: siivonek Date: Thu, 5 Sep 2024 12:40:01 +0300 Subject: [PATCH 228/237] Improve reference building for side reference size 64. --- src/strategies/avx2/intra-avx2.c | 52 +++++++++++++++++++++++++ src/strategies/avx2/intra_avx2_tables.h | 22 +++++++++++ 2 files changed, 74 insertions(+) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 768b7ad4..1f1836f4 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4404,7 +4404,59 @@ static void uvg_angular_pred_avx2( break; } case 64: + { + int shuf_offset = abs_mode_disp * 64; + __m128i vshuf0 = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_64[shuf_offset + 0]); + __m128i vshuf1 = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_64[shuf_offset + 16]); + __m128i vshuf2 = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_64[shuf_offset + 32]); + __m128i vshuf3 = _mm_load_si128((__m128i*) &intra_refbuild_shuffle_vectors_sidesize_64[shuf_offset + 48]); + + __m128i vref0 = _mm_loadu_si128((const __m128i*) &ref_side[ 0 + 1]); // Offset ref by one to fit all necessary 16 refs. Offset accounted for in shuffle vectors. 
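Same idea as the 32-sample case, extended to four 16-byte loads. Each control byte 0..63 selects one of the four source registers; the input is shuffled against all four and the results are merged with cascaded blends at thresholds 47, 31 and 15, highest first, so a later blend never overwrites an already correct pick. Generic sketch, illustrative only; the helper name is made up and the patch orders its operands to match the reversed output layout:

    /* Emulate a 64-entry byte table lookup with four 16-byte shuffles
     * and three cascaded blends. idx holds values 0..63. Requires SSE4.1 (immintrin.h). */
    static __m128i shuffle_from_64_bytes(__m128i v0, __m128i v1, __m128i v2, __m128i v3, __m128i idx)
    {
      const __m128i r0 = _mm_shuffle_epi8(v0, idx);   /* correct where idx is  0..15 */
      const __m128i r1 = _mm_shuffle_epi8(v1, idx);   /* correct where idx is 16..31 */
      const __m128i r2 = _mm_shuffle_epi8(v2, idx);   /* correct where idx is 32..47 */
      const __m128i r3 = _mm_shuffle_epi8(v3, idx);   /* correct where idx is 48..63 */
      __m128i res = _mm_blendv_epi8(r2, r3, _mm_cmpgt_epi8(idx, _mm_set1_epi8(47)));
      res         = _mm_blendv_epi8(r1, res, _mm_cmpgt_epi8(idx, _mm_set1_epi8(31)));
      res         = _mm_blendv_epi8(r0, res, _mm_cmpgt_epi8(idx, _mm_set1_epi8(15)));
      return res;
    }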
+ __m128i vref1 = _mm_loadu_si128((const __m128i*) &ref_side[16 + 1]); + __m128i vref2 = _mm_loadu_si128((const __m128i*) &ref_side[32 + 1]); + __m128i vref3 = _mm_loadu_si128((const __m128i*) &ref_side[48 + 1]); + + // First quarter of references use references from vref3 only + __m128i vrefout0 = _mm_shuffle_epi8(vref3, vshuf0); + + // Second quarter can require samples from vref3 and vref2 + __m128i vreftmp0 = _mm_shuffle_epi8(vref3, vshuf1); + __m128i vreftmp1 = _mm_shuffle_epi8(vref2, vshuf1); + __m128i vblend0 = _mm_cmpgt_epi8(vshuf1, _mm_set1_epi8(47)); + __m128i vrefout1 = _mm_blendv_epi8(vreftmp1, vreftmp0, vblend0); + + // Third quarter can require samples from vref3, vref2 and vref1 + vreftmp0 = _mm_shuffle_epi8(vref3, vshuf2); + vreftmp1 = _mm_shuffle_epi8(vref2, vshuf2); + __m128i vreftmp2 = _mm_shuffle_epi8(vref1, vshuf2); + vblend0 = _mm_cmpgt_epi8(vshuf2, _mm_set1_epi8(47)); + __m128i vblend1 = _mm_cmpgt_epi8(vshuf2, _mm_set1_epi8(31)); + + vreftmp0 = _mm_blendv_epi8(vreftmp1, vreftmp0, vblend0); + __m128i vrefout2 = _mm_blendv_epi8(vreftmp2, vreftmp0, vblend1); + + // Fourth quarter can require samples from vref3, vref2, vref1 and vref0 + vreftmp0 = _mm_shuffle_epi8(vref3, vshuf3); + vreftmp1 = _mm_shuffle_epi8(vref2, vshuf3); + vreftmp2 = _mm_shuffle_epi8(vref1, vshuf3); + __m128i vreftmp3 = _mm_shuffle_epi8(vref0, vshuf3); + + vblend0 = _mm_cmpgt_epi8(vshuf3, _mm_set1_epi8(47)); + vblend1 = _mm_cmpgt_epi8(vshuf3, _mm_set1_epi8(31)); + __m128i vblend2 = _mm_cmpgt_epi8(vshuf3, _mm_set1_epi8(15)); + + vreftmp0 = _mm_blendv_epi8(vreftmp1, vreftmp0, vblend0); + vreftmp0 = _mm_blendv_epi8(vreftmp2, vreftmp0, vblend1); + __m128i vrefout3 = _mm_blendv_epi8(vreftmp3, vreftmp0, vblend2); + + _mm_store_si128((__m128i*) &temp_main[0], vrefout0); + _mm_store_si128((__m128i*) &temp_main[16], vrefout1); + _mm_store_si128((__m128i*) &temp_main[32], vrefout2); + _mm_store_si128((__m128i*) &temp_main[48], vrefout3); + break; + } default: + // This should work in the case everything else fails. 
const int modedisp2invsampledisp_abs = modedisp2invsampledisp[abs_mode_disp]; for (int i = -size_side; i <= -1; i++) { ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp_abs + 256) >> 9, size_side)]; diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index c7f97a55..bd63cfed 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3722,4 +3722,26 @@ ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_32[] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, // mode disp 16 }; +ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_64[] = { + //<-v0----------------------------------------------------------------------------------------->||<-v1----------------------------------------------------------------------------------------->||<-v2----------------------------------------------------------------------------------------->||<-v3-----------------------------------------------------------------------------------------> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, // mode disp 1 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x2f, 0x1f, 0x0f, // mode disp 2 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x34, 0x2a, 0x1f, 0x14, 0x0a, // mode disp 3 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x37, 0x2f, 0x27, 0x1f, 0x17, 0x0f, 0x07, // mode disp 4 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3a, 0x34, 0x2f, 0x2a, 0x24, 
0x1f, 0x1a, 0x14, 0x0f, 0x0a, 0x04, // mode disp 5 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3b, 0x37, 0x33, 0x2f, 0x2b, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0x07, 0x03, // mode disp 6 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3c, 0x39, 0x35, 0x32, 0x2f, 0x2c, 0x29, 0x25, 0x22, 0x1f, 0x1c, 0x19, 0x15, 0x12, 0x0f, 0x0c, 0x09, 0x05, 0x02, // mode disp 7 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3c, 0x3a, 0x37, 0x34, 0x32, 0x2f, 0x2c, 0x2a, 0x27, 0x24, 0x22, 0x1f, 0x1c, 0x1a, 0x17, 0x14, 0x12, 0x0f, 0x0c, 0x0a, 0x07, 0x04, 0x02, // mode disp 8 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3d, 0x3a, 0x38, 0x36, 0x34, 0x31, 0x2f, 0x2d, 0x2a, 0x28, 0x26, 0x24, 0x21, 0x1f, 0x1d, 0x1a, 0x18, 0x16, 0x14, 0x11, 0x0f, 0x0d, 0x0a, 0x08, 0x06, 0x04, 0x01, // mode disp 9 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3d, 0x3b, 0x39, 0x37, 0x35, 0x33, 0x31, 0x2f, 0x2d, 0x2b, 0x29, 0x27, 0x25, 0x23, 0x21, 0x1f, 0x1d, 0x1b, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01, // mode disp 10 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3d, 0x3b, 0x3a, 0x38, 0x36, 0x34, 0x33, 0x31, 0x2f, 0x2d, 0x2b, 0x2a, 0x28, 0x26, 0x24, 0x23, 0x21, 0x1f, 0x1d, 0x1b, 0x1a, 0x18, 0x16, 0x14, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x0a, 0x08, 0x06, 0x04, 0x03, 0x01, // mode disp 11 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3d, 0x3c, 0x3a, 0x39, 0x37, 0x35, 0x34, 0x32, 0x31, 0x2f, 0x2d, 0x2c, 0x2a, 0x29, 0x27, 0x25, 0x24, 0x22, 0x21, 0x1f, 0x1d, 0x1c, 0x1a, 0x19, 0x17, 0x15, 0x14, 0x12, 0x11, 0x0f, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x05, 0x04, 0x02, 0x01, // mode disp 12 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3e, 0x3c, 0x3b, 0x39, 0x38, 0x37, 0x35, 0x34, 0x32, 0x31, 0x30, 0x2e, 0x2d, 0x2c, 0x2a, 0x29, 0x27, 0x26, 0x25, 0x23, 0x22, 0x20, 0x1f, 0x1e, 0x1c, 0x1b, 0x19, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0x10, 0x0e, 0x0d, 0x0c, 0x0a, 0x09, 0x07, 0x06, 0x05, 0x03, 0x02, 0x00, // mode disp 13 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3e, 0x3d, 0x3b, 0x3a, 0x39, 0x38, 0x36, 0x35, 0x34, 0x33, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2b, 0x2a, 0x29, 0x28, 0x26, 0x25, 0x24, 0x23, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1b, 0x1a, 0x19, 0x18, 0x16, 0x15, 0x14, 
0x13, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0b, 0x0a, 0x09, 0x08, 0x06, 0x05, 0x04, 0x03, 0x01, 0x00, // mode disp 14 + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x03, 0x02, 0x01, 0x00, // mode disp 15 + 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, // mode disp 16 + //<-v0----------------------------------------------------------------------------------------->||<-v1----------------------------------------------------------------------------------------->||<-v2----------------------------------------------------------------------------------------->||<-v3-----------------------------------------------------------------------------------------> +}; + #endif INTRA_AVX2_TABLES_H From 4e0c5295349b6ac24b6f9c5f8105188dd8655e26 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:36:05 +0300 Subject: [PATCH 229/237] Remove unused code. --- src/strategies/avx2/intra-avx2.c | 1373 +++--------------------------- 1 file changed, 129 insertions(+), 1244 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 1f1836f4..98413d7f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -851,20 +851,6 @@ static void angular_pred_w32_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } -static void angular_pred_generic_linear_filter(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int, const int16_t* delta_fract) -{ - // 2-tap filter - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - uvg_pixel ref1 = ref[x + delta_int[y] + 1]; - uvg_pixel ref2 = ref[x + delta_int[y] + 2]; - //dst[y * width + x] = ref1 + ((delta_fract[y] * (ref2 - ref1) + 16) >> 5); - dst[y * width + x] = ((32 - delta_fract[y]) * ref1 + delta_fract[y] * ref2 + 16) >> 5; - } - } -} - - // Linear interpolation filter for width 4 has a different call, since it uses premade tables for coefficients static void angular_pred_linear_filter_w4_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int32_t pred_mode) { @@ -913,7 +899,6 @@ static void angular_pred_linear_filter_w4_ver_avx2(uvg_pixel* dst, uvg_pixel* re } } - static void angular_pred_linear_filter_w8_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const int width = 8; @@ -953,7 +938,6 @@ static void angular_pred_linear_filter_w8_ver_avx2(uvg_pixel* dst, uvg_pixel* re } } - static void angular_pred_linear_filter_w16_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int pred_mode) { const __m128i v16s = _mm_set1_epi16(16); @@ -990,7 +974,6 @@ static void angular_pred_linear_filter_w16_ver_avx2(uvg_pixel* dst, uvg_pixel* r } } - static void angular_pred_linear_filter_w32_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int, const int 
pred_mode) { const __m256i v16s = _mm256_set1_epi16(16); @@ -1069,7 +1052,6 @@ static void angular_pred_linear_filter_w4_hor_avx2(uvg_pixel* dst, uvg_pixel* re } } - static void angular_pred_linear_filter_w8_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; @@ -1103,7 +1085,6 @@ static void angular_pred_linear_filter_w8_hor_avx2(uvg_pixel* dst, uvg_pixel* re } } - static void angular_pred_linear_filter_w16_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; @@ -1151,7 +1132,6 @@ static void angular_pred_linear_filter_w16_hor_avx2(uvg_pixel* dst, uvg_pixel* r } } - static void angular_pred_linear_filter_w32_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const int16_t* dint = delta_int; @@ -1203,118 +1183,6 @@ static void angular_pred_linear_filter_w32_hor_avx2(uvg_pixel* dst, uvg_pixel* r } -static void angular_pred_linear_filter_w8_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) -{ - const int width = 8; - const int16_t* dint = delta_int; - const __m128i v16s = _mm_set1_epi16(16); - // Height has to be at least 2, handle 2 lines at once - for (int y = 0; y < height; y += 2) { - uvg_pixel src[32]; - int16_t coeff_tmp[2]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int yy = 0; yy < 2; ++yy) { - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[yy * 16 + d + 0] = ref[dint[yy] + 1 + x + 0]; - src[yy * 16 + d + 1] = ref[dint[yy] + 1 + x + 1]; - } - int8_t tmp[2] = { 32 - delta_fract[y + yy], delta_fract[y + yy] }; - coeff_tmp[yy] = *(int16_t*)tmp; - } - dint += 2; - - const __m128i vcoeff0 = _mm_set1_epi16(coeff_tmp[0]); - const __m128i vcoeff1 = _mm_set1_epi16(coeff_tmp[1]); - - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; - - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff0); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff1); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); - - _mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; - } -} - - -static void angular_pred_linear_filter_w16_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) -{ - const int width = 16; - const int16_t* dint = delta_int; - const __m128i v16s = _mm_set1_epi16(16); - // Height has to be at least 2, handle 1 line at a time - for (int y = 0; y < height; ++y) { - uvg_pixel src[32]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[d + 0] = ref[*dint + 1 + x + 0]; - src[d + 1] = ref[*dint + 1 + x + 1]; - } - dint++; - - int8_t tmp[2] = { 32 - delta_fract[y], delta_fract[y] }; - const int16_t coeff_tmp = *(int16_t*)tmp; - const __m128i vcoeff = _mm_set1_epi16(coeff_tmp); - - const __m128i* vsrc0 = (const __m128i*) & src[0]; - const __m128i* vsrc1 = (const __m128i*) & src[16]; - - __m128i res0 = _mm_maddubs_epi16(*vsrc0, vcoeff); - __m128i res1 = _mm_maddubs_epi16(*vsrc1, vcoeff); - res0 = _mm_add_epi16(res0, v16s); - res1 = _mm_add_epi16(res1, v16s); - res0 = _mm_srai_epi16(res0, 5); - res1 = _mm_srai_epi16(res1, 5); - - 
_mm_store_si128((__m128i*)dst, _mm_packus_epi16(res0, res1)); - dst += 16; - } -} - - -static void angular_pred_linear_filter_w32_ver_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) -{ - const int width = 32; - const int16_t* dint = delta_int; - const __m256i v16s = _mm256_set1_epi16(16); - // Height has to be at least 2, handle 1 line at a time - for (int y = 0; y < height; ++y) { - uvg_pixel src[64]; - // TODO: get rid of this slow crap, this is just here to test the calculations - for (int x = 0, d = 0; x < width; ++x, d += 2) { - src[d + 0] = ref[*dint + 1 + x + 0]; - src[d + 1] = ref[*dint + 1 + x + 1]; - } - dint++; - - int8_t tmp[2] = { 32 - delta_fract[y], delta_fract[y] }; - const int16_t coeff_tmp = *(int16_t*)tmp; - const __m256i vcoeff = _mm256_set1_epi16(coeff_tmp); - - const __m256i* vsrc0 = (const __m256i*) & src[0]; - const __m256i* vsrc1 = (const __m256i*) & src[32]; - - __m256i res0 = _mm256_maddubs_epi16(*vsrc0, vcoeff); - __m256i res1 = _mm256_maddubs_epi16(*vsrc1, vcoeff); - res0 = _mm256_add_epi16(res0, v16s); - res1 = _mm256_add_epi16(res1, v16s); - res0 = _mm256_srai_epi16(res0, 5); - res1 = _mm256_srai_epi16(res1, 5); - - __m256i vfinal = _mm256_packus_epi16(res0, res1); - vfinal = _mm256_permute4x64_epi64(vfinal, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)dst, vfinal); - dst += 32; - } -} - - static void angular_pred_linear_filter_w4_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int) { const __m128i v16s = _mm_set1_epi16(16); @@ -1361,7 +1229,6 @@ static void angular_pred_linear_filter_w4_hor_wide_angle_avx2(uvg_pixel* dst, uv } } - static void angular_pred_linear_filter_w8_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 8; @@ -1437,7 +1304,6 @@ static void angular_pred_linear_filter_w8_hor_wide_angle_avx2(uvg_pixel* dst, uv } } - static void angular_pred_linear_filter_w16_hor_wide_angle_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int mode, const int16_t* delta_int, const int16_t* delta_fract) { const int width = 16; @@ -1533,7 +1399,7 @@ static void angular_pred_linear_filter_w16_hor_wide_angle_avx2(uvg_pixel* dst, u } } - +// Used for angles which do not require interpolation. static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, uvg_pixel* ref, const int width, const int height, const int16_t* delta_int) { // Note: this probably won't work for wide angle modes. @@ -1550,6 +1416,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_ver_avx2(uvg_pixel* dst, } } +// Horizontal pixel copy for prediction mode 2. static void angular_pred_non_fractional_angle_pxl_copy_w4_mode2_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int multi_ref_offset) { // const int width = 4; @@ -1623,6 +1490,7 @@ static void angular_pred_non_fractional_angle_pxl_copy_w8_mode2_hor_avx2(uvg_pix } } +// Horizontal pixel copy for wide angles modes. 
static void angular_pred_non_fractional_angle_pxl_copy_w4_wide_angle_hor_avx2(uvg_pixel* dst, uvg_pixel* ref, const int height, const int16_t* delta_int) { // const int width = 4; @@ -2084,44 +1952,45 @@ static void angular_pred_non_fractional_angle_pxl_copy_w32_wide_angle_hor_avx2(u static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; - int16_t left[4][4]; - - int limit = MIN(3 << scale, width); - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width - __m256i v32s = _mm256_set1_epi16(32); + //ALIGNED(32) uint8_t left[4][4]; + __m128i v32s = _mm_set1_epi16(32); // Scale can be 0, 1 or 2 const int offset = scale * 16; - const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_ver_weight[offset]); + const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w4_ver_improved_weight[offset]); const int inv_angle_offset = mode_disp * 64; const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vleftshuf = _mm_setr_epi8( + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, + 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f + ); + + __m128i vidx = _mm_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], + shifted_inv_angle_sum[2], shifted_inv_angle_sum[3]); + // For a 4 width block, height must be at least 4. Handle 4 lines at once. for (int y = 0; y < height; y += 4) { - for (int xx = 0; xx < width; ++xx) { - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } + __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); + __m128i vleft = _mm_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx, 1); + vleft = _mm_shuffle_epi8(vleft, vleftshuf); - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); + __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); + __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); + __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); + __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); + vmaddlo = _mm_add_epi16(vmaddlo, v32s); + vmaddhi = _mm_add_epi16(vmaddhi, v32s); - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + vmaddlo = _mm_srai_epi16(vmaddlo, 6); + vmaddhi = _mm_srai_epi16(vmaddhi, 6); + + __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); + + _mm_store_si128((__m128i*)(dst + (y * width)), packed); } } @@ -2260,838 +2129,131 @@ static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re } } -static void angular_pdpc_ver_w16_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. - // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. 
- const int scale = 1; - const int log2_width = uvg_g_convert_to_log2[width]; - const int limit = 6; +// Mode 18 + +static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 4; + const int limit = MIN(3 << scale, height); - __m128i vseq = _mm_set_epi64x(1, 0); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); __m256i v32s = _mm256_set1_epi16(32); - const int offset = scale * 16; - const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_ver_weight[offset]); + const uint32_t ref4 = *(uint32_t*)&ref_side[1]; - const int inv_angle_offset = mode_disp * 64; - const int16_t *shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + __m128i vref = _mm_set1_epi32(ref4); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - ALIGNED(32) int16_t left[16] = { 0 }; - for (int yy = 0; yy < 2; ++yy) { - for (int xx = 0; xx < limit; ++xx) { - left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); + // Weight table offset + const int table_offset = scale * 64; - __m256i accu = _mm256_sub_epi16(vleft, vdst16); + for (int y = 0, o = 0; y < limit; y += 4, o += 16) { + const int offset = table_offset + o; + + __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); + + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_hor_weight[offset]); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); accu = _mm256_mullo_epi16(vweight, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); + accu = _mm256_add_epi16(vpred16, accu); __m128i lo = _mm256_castsi256_si128(accu); __m128i hi = _mm256_extracti128_si256(accu, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); - *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); } } -static void angular_pdpc_ver_w16_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { + const int width = 8; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); - const int scale = 2; // Other functions handle scales 0 and 1 - int limit = 12; // With scale 2, limit is always 12. 
- const int offset = scale * 16; - const int inv_angle_offset = mode_disp * 64; - const int shuf_offset = mode_disp * 16; - - const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w16_ver_weight[offset]); - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m128i vshuf = _mm_load_si128((const __m128i*) &intra_pdpc_shuffle_vectors_w16_scale2_ver[shuf_offset]); + const uint64_t ref8 = *(uint64_t*)&ref_side[1]; - for (int y = 0; y < height; ++y) { - for (int x = 0; x < limit; x += 16) { - /*ALIGNED(32) int16_t left[16] = { 0 }; - for (int xx = 0; x + xx < limit; ++xx) { - left[xx] = ref_side[y + shifted_inv_angle_sum[xx] + 1]; - }*/ - __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); + __m128i vref = _mm_set1_epi64x(ref8); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); - __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); + // Weight table offset + const int table_offset = scale * 128; + + for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { + const __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_hor_weight[o]); + + __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vpred16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); + } +} + +static void angular_pdpc_mode18_w16_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +{ + const int width = 16; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); + + __m128i vref = _mm_loadu_si128((const __m128i*) & ref_side[1]); + __m256i vref16 = _mm256_cvtepu8_epi16(vref); + + __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); + + // Handle one line at a time. Skip line if vertical limit reached. 
+ for (int y = 0; y < limit; ++y) { + const int16_t wT = 32 >> (2 * (y + 0) >> scale); + __m256i vwT = _mm256_set1_epi16(wT); + + for (int x = 0; x < width; x += 16) { + __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); + __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); + + __m256i accu = _mm256_sub_epi16(vref16, vtopleft); + accu = _mm256_mullo_epi16(vwT, accu); accu = _mm256_add_epi16(accu, v32s); accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); + accu = _mm256_add_epi16(vpred16, accu); __m128i lo = _mm256_castsi256_si128(accu); __m128i hi = _mm256_extracti128_si256(accu, 1); __m128i filtered = _mm_packus_epi16(lo, hi); - _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); + _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); } } } - -static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_mode18_w32_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { - const int width = 4; - int16_t left[4][4]; + const int width = 32; + int limit = MIN(3 << scale, height); + __m256i v32s = _mm256_set1_epi16(32); - int limit = MIN(3 << scale, width); + __m128i vrefa = _mm_loadu_si128((const __m128i*) & ref_side[1]); + __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); - //__m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - //__m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2 width - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int offset = scale * 16; - const int inv_angle_offset = mode_disp * 64; - const int shuf_offset = mode_disp * 16; - - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); - const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); - - // For a 4 width block, height must be at least 4. Handle 4 lines at once. - for (int y = 0; y < height; y += 4) { - /*for (int xx = 0; xx < width; ++xx) { - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - }*/ - __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); - - //__m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vseq, 4); - __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. - // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. 
- const int scale = 0; - int16_t left[4][4]; - const int log2_width = uvg_g_convert_to_log2[width]; - - const int limit = 3; - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int offset = scale * 16; - const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); - - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - // For a 4 width block, height must be at least 4. Handle 4 lines at once. - for (int y = 0; y < height; y += 4) { - for (int xx = 0; xx < 4; ++xx) { - for (int yy = 0; yy < 4; ++yy) { - left[yy][xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } - - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(filtered, 3); - } -} - -static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. - // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. - // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. - const int scale = 0; - int16_t left[4][4]; - const int log2_width = uvg_g_convert_to_log2[width]; - - const int limit = 3; - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int offset = scale * 16; - const int inv_angle_offset = mode_disp * 64; - const int shuf_offset = mode_disp * 16; - - const __m256i vweight = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_ver_weight[offset]); - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m128i vshuf = _mm_loadu_si128((__m128i*) &intra_pdpc_shuffle_vectors_w4_ver[shuf_offset]); - - // For a 4 width block, height must be at least 4. Handle 4 lines at once. 
- for (int y = 0; y < height; y += 4) { - __m128i vleft = _mm_loadu_si128((__m128i*) &ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); - - __m128i vdst = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - *(uint32_t*)(dst + (y + 0) * width) = _mm_extract_epi32(filtered, 0); - *(uint32_t*)(dst + (y + 1) * width) = _mm_extract_epi32(filtered, 1); - *(uint32_t*)(dst + (y + 2) * width) = _mm_extract_epi32(filtered, 2); - *(uint32_t*)(dst + (y + 3) * width) = _mm_extract_epi32(filtered, 3); - } -} - - -static void angular_pdpc_ver_8x2_scale1_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. - // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. - const int scale = 1; - const int log2_width = uvg_g_convert_to_log2[width]; - - const int limit = 6; - - __m128i vseq = _mm_set_epi64x(1, 0); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - - const int offset = scale * 16; - const int inv_angle_offset = mode_disp * 64; - const int shuf_offset = mode_disp * 16; - - const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_8x2_scale1_ver[shuf_offset]); - - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - ALIGNED(32) int16_t left[16] = { 0 }; - for (int yy = 0; yy < 2; ++yy) { - for (int xx = 0; xx < limit; ++xx) { - left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } - //__m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); - //vleft = _mm_shuffle_epi8(vleft, vshuf); - - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - //__m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. 
- // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity - if (width == 8) { - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); - } - else { - *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); - *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); - } - } -} - -static void angular_pdpc_ver_8x2_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. - // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. - // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. - const int scale = 2; - const int log2_width = uvg_g_convert_to_log2[width]; - - const int limit = 6; - - __m128i vseq = _mm_set_epi64x(1, 0); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - - const int offset = scale * 16; - const int inv_angle_offset = mode_disp * 64; - const int shuf_offset = mode_disp * 16; - - const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_8x2_scale2_ver[shuf_offset]); - - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - /*ALIGNED(32) int16_t left[16] = { 0 }; - for (int yy = 0; yy < 2; ++yy) { - for (int xx = 0; xx < limit; ++xx) { - left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - }*/ - __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); - vleft = _mm_shuffle_epi8(vleft, vshuf); - - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - //__m256i vleft = _mm256_loadu_si256((__m256i*)left); - - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. - // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity - if (width == 8) { - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); - } - else { - *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); - *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); - } - } -} - -static void angular_pdpc_ver_8x2_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. 
- // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. - // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. - const int scale = 1; - const int log2_width = uvg_g_convert_to_log2[width]; - - const int limit = 6; - - __m128i vseq = _mm_set_epi64x(1, 0); - __m128i vidx = _mm_slli_epi32(vseq, log2_width); - __m256i v32s = _mm256_set1_epi16(32); - - const int offset = scale * 16; - const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); - - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - // For width 8, height must be at least 2. Handle 2 lines at once. - for (int y = 0; y < height; y += 2) { - ALIGNED(32) int16_t left[16] = { 0 }; - for (int yy = 0; yy < 2; ++yy) { - for (int xx = 0; xx < limit; ++xx) { - left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; - } - } - - __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. - // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity - if (width == 8) { - _mm_store_si128((__m128i*)(dst + (y * width)), filtered); - } - else { - *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); - *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); - } - } -} - - -// Height versions of vertical PDPC, these are unused but left here for archiving purposes. Maybe this method can be refined to be effective. - -static void angular_pdpc_ver_h4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int scale, const int16_t inv_sample_disp) -{ - const int height = 4; - - int limit = MIN(3 << scale, width); - const int log2_width = uvg_g_convert_to_log2[width]; - - const __m256i v32s = _mm256_set1_epi16(32); - const __m256i wL_shuffle = _mm256_setr_epi8( - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, - 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, - 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a, 0x08, 0x0a - ); - - for (int x = 0; x < limit; x += 4) { - int shifted_inv_angle_sum[4] = {0}; - int16_t wL[4] = {0}; - ALIGNED(32) uvg_pixel tmp[16]; - for (int xx = 0; xx < 4; ++xx) { - shifted_inv_angle_sum[xx] = (256 + (x + xx + 1) * inv_sample_disp) >> 9; - wL[xx] = (x + xx) < limit ? 
32 >> ((2 * (x + xx)) >> scale) : 0; - - tmp[xx * 4 + 0] = ref_side[0 + shifted_inv_angle_sum[xx] + 1]; - tmp[xx * 4 + 1] = ref_side[1 + shifted_inv_angle_sum[xx] + 1]; - tmp[xx * 4 + 2] = ref_side[2 + shifted_inv_angle_sum[xx] + 1]; - tmp[xx * 4 + 3] = ref_side[3 + shifted_inv_angle_sum[xx] + 1]; - - } - - int16_t tmp_dst[16]; - for (int yy = 0; yy < height; ++yy) { - tmp_dst[0 + yy] = dst[yy * width + x + 0]; - tmp_dst[4 + yy] = dst[yy * width + x + 1]; - tmp_dst[8 + yy] = dst[yy * width + x + 2]; - tmp_dst[12 + yy] = dst[yy * width + x + 3]; - } - - __m256i* vdst16 = (__m256i*)tmp_dst; - __m128i vleft = _mm_load_si128((__m128i*)tmp); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - __m256i accu = _mm256_sub_epi16(vleft16, *vdst16); - __m256i vwL = _mm256_setr_epi64x(wL[0], wL[1], wL[2], wL[3]); - vwL = _mm256_shuffle_epi8(vwL, wL_shuffle); - accu = _mm256_mullo_epi16(vwL, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(*vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - const uvg_pixel* result = (uvg_pixel*)&filtered; - - for (int yy = 0; yy < height; ++yy) { - dst[yy * width + x + 0] = result[0 + yy]; - dst[yy * width + x + 1] = result[4 + yy]; - dst[yy * width + x + 2] = result[8 + yy]; - dst[yy * width + x + 3] = result[12 + yy]; - } - } -} - -static void angular_pdpc_ver_h8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int scale, const int16_t inv_sample_disp) -{ - const int height = 8; - - int limit = MIN(3 << scale, width); - __m256i v32s = _mm256_set1_epi16(32); - - for (int x = 0; x < limit; x += 2) { - int shifted_inv_angle_sum0 = (256 + (x + 0 + 1) * inv_sample_disp) >> 9; - int shifted_inv_angle_sum1 = (256 + (x + 1 + 1) * inv_sample_disp) >> 9; - __m128i vwL[2]; - const int16_t wL0 = 32 >> ((2 * (x + 0)) >> scale); - const int16_t wL1 = (x + 1) < limit ? 
32 >> ((2 * (x + 1)) >> scale) : 0; - vwL[0] = _mm_set1_epi16(wL0); - vwL[1] = _mm_set1_epi16(wL1); - - ALIGNED(32) int16_t tmp_dst[16]; - for (int yy = 0; yy < height; ++yy) { - tmp_dst[0 + yy] = dst[(yy) * width + x + 0]; - tmp_dst[8 + yy] = dst[(yy) * width + x + 1]; - } - - ALIGNED(32) uvg_pixel left[16]; - memcpy(&left[0], &ref_side[shifted_inv_angle_sum0 + 1], 8 * sizeof(uvg_pixel)); - memcpy(&left[8], &ref_side[shifted_inv_angle_sum1 + 1], 8 * sizeof(uvg_pixel)); - - __m256i vdst16 = _mm256_load_si256((__m256i*)tmp_dst); - __m128i vleft = _mm_load_si128((__m128i*)left); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - __m256i* vwL256 = (__m256i*)vwL; - - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(*vwL256, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - const uvg_pixel* result = (uvg_pixel*)&filtered; - for (int yy = 0; yy < height; ++yy) { - dst[(yy) * width + x + 0] = result[0 + yy]; - dst[(yy) * width + x + 1] = result[8 + yy]; - } - - } -} - -static void angular_pdpc_ver_h16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int16_t inv_sample_disp) -{ - int limit = MIN(3 << scale, width); - __m256i v32s = _mm256_set1_epi16(32); - - for (int x = 0; x < limit; ++x) { - int shifted_inv_angle_sum = (256 + (x + 1) * inv_sample_disp) >> 9; - const int16_t wL = 32 >> ((2 * x) >> scale); - const __m256i vwL = _mm256_set1_epi16(wL); - - for (int y = 0; y < height; y += 16) { - ALIGNED(32) int16_t tmp_dst[16]; - for (int yy = 0; yy < 16; ++yy) { - tmp_dst[yy] = dst[(y + yy) * width + x]; - } - __m256i vdst16 = _mm256_load_si256((__m256i*)tmp_dst); - __m128i vleft = _mm_loadu_si128((__m128i*)&ref_side[y + shifted_inv_angle_sum + 1]); - __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); - - __m256i accu = _mm256_sub_epi16(vleft16, vdst16); - accu = _mm256_mullo_epi16(vwL, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - const uvg_pixel* result = (uvg_pixel*)&filtered; - for (int yy = 0; yy < 16; ++yy) { - dst[(y + yy) * width + x] = result[yy]; - } - } - } -} - -static void angular_pdpc_hor_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) -{ - const int width = 4; - - int16_t wT[4]; - int8_t ref_top[4][4]; - - int limit = MIN(3 << scale, height); - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int table_offset = scale * 64; - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - for (int y = 0, o = 0; y < limit; y += 4, o += 16) { - for (int yy = 0; yy < 4; ++yy) { - memcpy(ref_top[yy], &ref_side[shifted_inv_angle_sum[y + yy] + 1], 4 * sizeof(int8_t)); - } - const int offset = table_offset + o; - - __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m128i vtop = 
_mm_loadu_si128((__m128i*)ref_top); - __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); - __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w4_hor_weight[offset]); - - __m256i accu = _mm256_sub_epi16(vtop16, vpred16); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) -{ - const int width = 8; - - int limit = MIN(3 << scale, height); - - __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int table_offset = scale * 128; - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { - const __m256i vwT = _mm256_load_si256((const __m256i*)&intra_pdpc_w8_hor_weight[o]); - - ALIGNED(32) uvg_pixel tmp[16]; - memcpy(&tmp[0], &ref_side[shifted_inv_angle_sum[y + 0] + 1], 8 * sizeof(uvg_pixel)); - memcpy(&tmp[8], &ref_side[shifted_inv_angle_sum[y + 1] + 1], 8 * sizeof(uvg_pixel)); - - __m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m128i vtop = _mm_load_si128((__m128i*)tmp); - __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); - - __m256i accu = _mm256_sub_epi16(vtop16, vpred16); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) -{ - int limit = MIN(3 << scale, height); - __m256i v32s = _mm256_set1_epi16(32); - - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - // Handle one line at a time. Skip line if vertical limit reached. 
- for (int y = 0; y < limit; ++y) { - const int16_t wT = 32 >> (2 * (y + 0) >> scale); - __m256i vwT = _mm256_set1_epi16(wT); - - for (int x = 0; x < width; x += 16) { - __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m128i vtop = _mm_loadu_si128((__m128i*)&ref_side[x + shifted_inv_angle_sum[y] + 1]); - __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); - - __m256i accu = _mm256_sub_epi16(vtop16, vpred16); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); - } - } -} - - -static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) -{ - const int width = 4; - - int16_t wT[4]; - int8_t ref_top[4][4]; - - int limit = MIN(3 << scale, height); - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width - __m256i v32s = _mm256_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int table_offset = scale * 64; - const int shuf_offset = mode_disp * 256; - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - for (int y = 0, o = 0; y < limit; y += 4, o += 16) { - const __m128i vshuf = _mm_loadu_si128((__m128i*)&intra_pdpc_shuffle_vectors_w4_hor[shuf_offset + o]); - /*for (int yy = 0; yy < 4; ++yy) { - memcpy(ref_top[yy], &ref_side[shifted_inv_angle_sum[y + yy] + 1], 4 * sizeof(int8_t)); - }*/ - - __m128i vtop = _mm_loadu_si128((__m128i*)&ref_side[shifted_inv_angle_sum[y] + 1]); - vtop = _mm_shuffle_epi8(vtop, vshuf); - - const int offset = table_offset + o; - - __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - //__m128i vtop = _mm_loadu_si128((__m128i*)ref_top); - __m256i vtop16 = _mm256_cvtepu8_epi16(vtop); - __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_hor_weight[offset]); - - __m256i accu = _mm256_sub_epi16(vtop16, vpred16); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - - -// Improved PDPC functions. 
These use the streamlined PDPC equation - -// Mode 18 - -static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 4; - const int limit = MIN(3 << scale, height); - - __m256i v32s = _mm256_set1_epi16(32); - - const uint32_t ref4 = *(uint32_t*)&ref_side[1]; - - __m128i vref = _mm_set1_epi32(ref4); - __m256i vref16 = _mm256_cvtepu8_epi16(vref); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Weight table offset - const int table_offset = scale * 64; - - for (int y = 0, o = 0; y < limit; y += 4, o += 16) { - const int offset = table_offset + o; - - __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); - - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w4_hor_weight[offset]); - - __m256i accu = _mm256_sub_epi16(vref16, vtopleft); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 8; - int limit = MIN(3 << scale, height); - - __m256i v32s = _mm256_set1_epi16(32); - - const uint64_t ref8 = *(uint64_t*)&ref_side[1]; - - __m128i vref = _mm_set1_epi64x(ref8); - __m256i vref16 = _mm256_cvtepu8_epi16(vref); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Weight table offset - const int table_offset = scale * 128; - - for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { - const __m256i vwT = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_hor_weight[o]); - - __m128i vpred = _mm_load_si128((__m128i*)(dst + y * width)); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - - __m256i accu = _mm256_sub_epi16(vref16, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -// Can't do anything to improve w16 - -static void angular_pdpc_mode18_w32_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 32; - int limit = MIN(3 << scale, height); - __m256i v32s = _mm256_set1_epi16(32); - - __m128i vrefa = _mm_loadu_si128((const __m128i*) & ref_side[1]); - __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); - - __m128i vrefb = _mm_loadu_si128((const __m128i*) & ref_side[17]); - __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); + __m128i vrefb = _mm_loadu_si128((const __m128i*) & ref_side[17]); + __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); @@ -3204,9 +2366,6 @@ static void angular_pdpc_mode18_w64_improved_avx2(uvg_pixel* dst, const uvg_pixe } -// Mode 50 - - // Vertical modes static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int 
mode_disp) @@ -3815,294 +2974,7 @@ static void angular_pdpc_hor_w16_improved_avx2(uvg_pixel* dst, const uvg_pixel* } -// This is the non-vectorized version of pdpc mode 18. It is left here for archiving purposes. -static void angular_pdpc_mode18_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) -{ - const int limit = MIN(3 << scale, height); - for (int_fast32_t x = 0; x < width; ++x) { - const uvg_pixel ref_top = ref_side[1 + x]; - for (int yy = 0; yy < limit; ++yy) { - const int wT = 32 >> ((yy * 2) >> scale); - const uvg_pixel val = dst[yy * width + x]; - dst[yy * width + x] = CLIP_TO_PIXEL(val + (((ref_top - top_left) * wT + 32) >> 6)); - } - } -} - -static void angular_pdpc_mode18_w4_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 4; - const int limit = MIN(3 << scale, height); - - __m128i vseq = _mm_setr_epi32(0, 1, 2, 3); - __m128i vidx = _mm_slli_epi32(vseq, 2); // 2 is log2_width - __m256i v32s = _mm256_set1_epi16(32); - - const uint32_t ref4 = *(uint32_t*)&ref_side[1]; - - __m128i vref = _mm_set1_epi32(ref4); - __m256i vref16 = _mm256_cvtepu8_epi16(vref); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Weight table offset - const int table_offset = scale * 64; - - for (int y = 0, o = 0; y < limit; y += 4, o += 16) { - const int offset = table_offset + o; - - __m128i vpred = _mm_i32gather_epi32((const int32_t*)(dst + y * width), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - __m256i vwT = _mm256_load_si256((const __m256i*) &intra_pdpc_w4_hor_weight[offset]); - - __m256i accu = _mm256_sub_epi16(vref16, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_mode18_w8_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 8; - - int limit = MIN(3 << scale, height); - - __m128i vseq = _mm_setr_epi32(0x00, 0x00, 0x01, 0x00); - __m128i vidx = _mm_slli_epi64(vseq, 3); // 3 is log2 width - __m256i v32s = _mm256_set1_epi16(32); - - const uint64_t ref8 = *(uint64_t*)&ref_side[1]; - - __m128i vref = _mm_set1_epi64x(ref8); - __m256i vref16 = _mm256_cvtepu8_epi16(vref); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Weight table offset - const int table_offset = scale * 128; - - for (int y = 0, o = table_offset; y < limit; y += 2, o += 16) { - const __m256i vwT = _mm256_load_si256((const __m256i*) &intra_pdpc_w8_hor_weight[o]); - - __m128i vpred = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - - __m256i accu = _mm256_sub_epi16(vref16, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width)), filtered); - } -} - -static void angular_pdpc_mode18_w16_avx2(uvg_pixel* dst, const 
uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 16; - int limit = MIN(3 << scale, height); - __m256i v32s = _mm256_set1_epi16(32); - - __m128i vref = _mm_loadu_si128((const __m128i*)&ref_side[1]); - __m256i vref16 = _mm256_cvtepu8_epi16(vref); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Handle one line at a time. Skip line if vertical limit reached. - for (int y = 0; y < limit; ++y) { - const int16_t wT = 32 >> (2 * (y + 0) >> scale); - __m256i vwT = _mm256_set1_epi16(wT); - - for (int x = 0; x < width; x += 16) { - __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + x))); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - - __m256i accu = _mm256_sub_epi16(vref16, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + x)), filtered); - } - } -} - -static void angular_pdpc_mode18_w32_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 32; - int limit = MIN(3 << scale, height); - __m256i v32s = _mm256_set1_epi16(32); - - __m128i vrefa = _mm_loadu_si128((const __m128i*) &ref_side[1]); - __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); - - __m128i vrefb = _mm_loadu_si128((const __m128i*) &ref_side[17]); - __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Handle one line at a time. Skip line if vertical limit reached. - for (int y = 0; y < limit; ++y) { - const int16_t wT = 32 >> (2 * (y + 0) >> scale); - __m256i vwT = _mm256_set1_epi16(wT); - - // Calculate first half - __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - - __m256i accu = _mm256_sub_epi16(vref16a, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 0)), filtered); - - // Calculate second half - vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); - vpred16 = _mm256_cvtepu8_epi16(vpred); - - accu = _mm256_sub_epi16(vref16b, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - lo = _mm256_castsi256_si128(accu); - hi = _mm256_extracti128_si256(accu, 1); - filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 16)), filtered); - } -} - -static void angular_pdpc_mode18_w64_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) -{ - const int width = 64; - int limit = MIN(3 << scale, height); - __m256i v32s = _mm256_set1_epi16(32); - - __m128i vrefa = _mm_loadu_si128((const __m128i*) &ref_side[0 + 1]); - __m256i vref16a = _mm256_cvtepu8_epi16(vrefa); - - __m128i vrefb = _mm_loadu_si128((const __m128i*) &ref_side[16 + 1]); - __m256i vref16b = _mm256_cvtepu8_epi16(vrefb); - - __m128i vrefc = _mm_loadu_si128((const 
__m128i*) &ref_side[32 + 1]); - __m256i vref16c = _mm256_cvtepu8_epi16(vrefc); - - __m128i vrefd = _mm_loadu_si128((const __m128i*) &ref_side[48 + 1]); - __m256i vref16d = _mm256_cvtepu8_epi16(vrefd); - - __m256i vtopleft = _mm256_set1_epi16((uint16_t)top_left); - - // Handle one line at a time. Skip line if vertical limit reached. - for (int y = 0; y < limit; ++y) { - const int16_t wT = 32 >> (2 * (y + 0) >> scale); - __m256i vwT = _mm256_set1_epi16(wT); - - // Calculate first quarter - __m128i vpred = _mm_load_si128((__m128i*)(dst + (y * width + 0))); - __m256i vpred16 = _mm256_cvtepu8_epi16(vpred); - - __m256i accu = _mm256_sub_epi16(vref16a, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 0)), filtered); - - // Calculate second quarter - vpred = _mm_load_si128((__m128i*)(dst + (y * width + 16))); - vpred16 = _mm256_cvtepu8_epi16(vpred); - - accu = _mm256_sub_epi16(vref16b, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - lo = _mm256_castsi256_si128(accu); - hi = _mm256_extracti128_si256(accu, 1); - filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 16)), filtered); - - // Calculate third quarter - vpred = _mm_load_si128((__m128i*)(dst + (y * width + 32))); - vpred16 = _mm256_cvtepu8_epi16(vpred); - - accu = _mm256_sub_epi16(vref16c, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - lo = _mm256_castsi256_si128(accu); - hi = _mm256_extracti128_si256(accu, 1); - filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 32)), filtered); - - // Calculate fourth quarter - vpred = _mm_load_si128((__m128i*)(dst + (y * width + 48))); - vpred16 = _mm256_cvtepu8_epi16(vpred); - - accu = _mm256_sub_epi16(vref16d, vtopleft); - accu = _mm256_mullo_epi16(vwT, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vpred16, accu); - - lo = _mm256_castsi256_si128(accu); - hi = _mm256_extracti128_si256(accu, 1); - filtered = _mm_packus_epi16(lo, hi); - - _mm_storeu_si128((__m128i*)(dst + (y * width + 48)), filtered); - } -} - - -// This is the non-vectorized version of pdpc mode 50. It is left here for archiving purposes. -static void angular_pdpc_mode50_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int width, const int height, const int scale) -{ - const int limit = MIN(3 << scale, width); - for (int y = 0; y < height; ++y) { - const uvg_pixel left = ref_side[1 + y]; - for (int x = 0; x < limit; x++) { - const int wL = 32 >> (2 * x >> scale); - const uvg_pixel val = dst[y * width + x]; - dst[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); - } - } -} - +// Prediction mode 50 versions of PDPC functions. 
static void angular_pdpc_mode50_w4_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 4; @@ -4266,6 +3138,19 @@ static void angular_pdpc_mode50_scale1_avx2(uvg_pixel* dst, const uvg_pixel top_ } } +// The main angular prediction entry point for AVX2. +/** + * \brief AVX2 version of angular intra prediction. + * \param cu_loc CU location and size data. + * \param intra_mode Intra prediction mode. + * \param channel_type Color channel. + * \param in_ref_above Pointer to -1 index of above reference. + * \param in_ref_left Pointer to -1 index of left reference. + * \param dst Buffer of size MAX_PRED_WIDTH * MAX_PRED_WIDTH. + * \param multi_ref_idx Multi reference index. + * \param isp_mode Intra sub-partition mode. + * \param cu_dim CU dimension, used along ISP mode. + */ static void uvg_angular_pred_avx2( const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, From 3ffbb6507efa2c06729f2cc3b28323dd01b4e977 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:41:17 +0300 Subject: [PATCH 230/237] Remove unused code. --- src/strategies/avx2/intra-avx2.c | 383 +------------------------------ 1 file changed, 4 insertions(+), 379 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 98413d7f..fe2f06eb 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -5559,78 +5559,6 @@ static void mip_upsampling_w16_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst_ptr + dst_step * 3), vtmp3); } -static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 4; // width / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - - __m128i vshuf = _mm_setr_epi8( - 0x00, 0x08, 0x01, 0x09, 0x02, 0x0a, 0x03, 0x0b, - 0x04, 0x0c, 0x05, 0x0d, 0x06, 0x0e, 0x07, 0x0f - ); - - __m128i vrnd = _mm_set1_epi16(rounding_offset); - - ALIGNED(32) int16_t refs[8]; - ALIGNED(32) int16_t srcs[8]; - const uvg_pixel* ref_ptr = ref + ref_step - 1; - const uvg_pixel* src_ptr = src; - - int step = ref_step; - - for (int i = 0; i < 8; i++) { - for (int ref = 0; ref < 8; ++ref) { - refs[ref] = *ref_ptr; - srcs[ref] = *src_ptr; - - ref_ptr += step; - src_ptr += red_pred_size; - } - - __m128i vaccu_ref = _mm_load_si128((__m128i*)refs); - __m128i vsub_ref = vaccu_ref; - vaccu_ref = _mm_slli_epi16(vaccu_ref, log2_factor); - - __m128i vaccu_src = _mm_setzero_si128(); - __m128i vadd_src = _mm_load_si128((__m128i*)srcs); - - __m128i vres[4]; - for (int res = 0; res < 4; ++res) { - vaccu_ref = _mm_sub_epi16(vaccu_ref, vsub_ref); - vaccu_src = _mm_add_epi16(vaccu_src, vadd_src); - vres[res] = _mm_add_epi16(vaccu_ref, vaccu_src); - vres[res] = _mm_add_epi16(vres[res], vrnd); - vres[res] = _mm_srli_epi16(vres[res], log2_factor); - } - - __m128i vout0 = _mm_packus_epi16(vres[0], vres[1]); - __m128i vout1 = _mm_packus_epi16(vres[2], vres[3]); - vout0 = _mm_shuffle_epi8(vout0, vshuf); - vout1 = _mm_shuffle_epi8(vout1, vshuf); - - __m128i vtmp16lo = _mm_unpacklo_epi16(vout0, vout1); - __m128i vtmp16hi = _mm_unpackhi_epi16(vout0, vout1); - - const int dst_offset = i * 4; - - *(uint32_t*)&dst[dst_offset + dst_step * 0] = _mm_extract_epi32(vtmp16lo, 0); - *(uint32_t*)&dst[dst_offset + dst_step * 1] = 
_mm_extract_epi32(vtmp16lo, 1); - *(uint32_t*)&dst[dst_offset + dst_step * 2] = _mm_extract_epi32(vtmp16lo, 2); - *(uint32_t*)&dst[dst_offset + dst_step * 3] = _mm_extract_epi32(vtmp16lo, 3); - *(uint32_t*)&dst[dst_offset + dst_step * 4] = _mm_extract_epi32(vtmp16hi, 0); - *(uint32_t*)&dst[dst_offset + dst_step * 5] = _mm_extract_epi32(vtmp16hi, 1); - *(uint32_t*)&dst[dst_offset + dst_step * 6] = _mm_extract_epi32(vtmp16hi, 2); - *(uint32_t*)&dst[dst_offset + dst_step * 7] = _mm_extract_epi32(vtmp16hi, 3); - - ref_ptr = src + i; - src_ptr = src + i + 1; - step = red_pred_size; // Switch ref step - } -} - static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; @@ -6211,7 +6139,7 @@ static void mip_upsampling_w8_ups2_h16_ver_avx2(uvg_pixel* const dst, const uvg_ _mm_store_si128((__m128i*)(dst + 96), vres2); _mm_store_si128((__m128i*)(dst + 112), vres3); } -// + static void mip_upsampling_w8_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6429,6 +6357,7 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 224), vavg7); } +// TODO: check which upsampling w16 ups4 version is faster and delete the obsolete one. static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -6583,89 +6512,6 @@ static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } -// Note: this alternate version is slower than the original version. It is kept here for reference. -static void mip_upsampling_w16_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uvg_pixel* src_ptr = src; - const uvg_pixel* dst_ptr = dst; - - const __m128i zeros = _mm_setzero_si128(); - const __m128i ones = _mm_set1_epi8(1); - const __m128i twos = _mm_set1_epi8(2); - const __m128i threes = _mm_set1_epi8(3); - const __m128i fours = _mm_set1_epi8(4); - const __m128i fives = _mm_set1_epi8(5); - const __m128i sixes = _mm_set1_epi8(6); - const __m128i sevens = _mm_set1_epi8(7); - const __m128i eights = _mm_set1_epi8(8); - - __m128i vbefore = _mm_load_si128((__m128i*)ref); - - for (int i = 0; i < 8; ++i) { - __m128i vbehind = _mm_loadu_si128((__m128i*)src_ptr); - - // Calculate the 7 interpolated lines between before and behind. Ordered by number from top to bottom. - __m128i vrow3 = _mm_avg_epu8(vbefore, vbehind); // Middle - __m128i vrow1 = _mm_avg_epu8(vrow3, vbefore); // Top middle - __m128i vrow5 = _mm_avg_epu8(vrow3, vbehind); // Bottom middle - __m128i vrow0 = _mm_avg_epu8(vbefore, vrow1); // Top middle top - __m128i vrow2 = _mm_avg_epu8(vrow1, vrow3); // Top middle bottom - __m128i vrow4 = _mm_avg_epu8(vrow3, vrow5); // Bottom middle top - __m128i vrow6 = _mm_avg_epu8(vrow5, vbehind); // Bottom middle bottom - - // Calculate the three and two last bits of difference between before and behind. These bits are used to determine if there will be rounding error. - __m128i diff = _mm_sub_epi8(vbehind, vbefore); - diff = _mm_and_si128(diff, sevens); - __m128i three_diff = _mm_and_si128(diff, threes); - - // Bottom side - __m128i mask = _mm_cmpgt_epi8(diff, fours); // The rounding error mask will be generated based on the calculated last bits. 
- __m128i sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 5, 6, 7 select one - vrow6 = _mm_sub_epi8(vrow6, sub_amount); - - mask = _mm_cmpeq_epi8(three_diff, threes); - sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 3 or 7 select one - vrow5 = _mm_sub_epi8(vrow5, sub_amount); - - __m128i is_two = _mm_cmpeq_epi8(diff, twos); - __m128i is_five = _mm_cmpeq_epi8(diff, fives); - mask = _mm_or_si128(mask, is_two); - mask = _mm_or_si128(mask, is_five); - sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 2, 3, 5, or 7 select one - vrow4 = _mm_sub_epi8(vrow4, sub_amount); - - // Top side - diff = _mm_blendv_epi8(diff, eights, _mm_cmpeq_epi8(zeros, diff)); // Replace zeros with eights to enable using GT - mask = _mm_cmpgt_epi8(diff, threes); - sub_amount = _mm_blendv_epi8(ones, zeros, mask); // If greater than three select zero - vrow0 = _mm_sub_epi8(vrow0, sub_amount); - - mask = _mm_cmpeq_epi8(three_diff, ones); - sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 1 or 5 select one - vrow1 = _mm_sub_epi8(vrow1, sub_amount); - - __m128i is_three = _mm_cmpeq_epi8(diff, threes); - __m128i is_six = _mm_cmpeq_epi8(diff, sixes); - mask = _mm_or_si128(mask, is_three); - mask = _mm_or_si128(mask, is_six); - sub_amount = _mm_blendv_epi8(zeros, ones, mask); // If 1, 3, 5, 6 select one - vrow2 = _mm_sub_epi8(vrow2, sub_amount); - - // Store results - _mm_store_si128((__m128i*)(dst_ptr + 0), vrow0); - _mm_store_si128((__m128i*)(dst_ptr + 16), vrow1); - _mm_store_si128((__m128i*)(dst_ptr + 32), vrow2); - _mm_store_si128((__m128i*)(dst_ptr + 48), vrow3); - _mm_store_si128((__m128i*)(dst_ptr + 64), vrow4); - _mm_store_si128((__m128i*)(dst_ptr + 80), vrow5); - _mm_store_si128((__m128i*)(dst_ptr + 96), vrow6); - - vbefore = vbehind; - src_ptr += 128; - dst_ptr += 128; - } -} - static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbefore = _mm256_loadu_si256((__m256i*)ref); @@ -6680,84 +6526,6 @@ static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } -static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 4; // height / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - __m256i vbefore256a; - __m256i vbehind256a; - - __m256i vbefore256b; - __m256i vbehind256b; - - __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); - __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); - vbefore256a = _mm256_cvtepu8_epi16(vbeforea); - vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); - - for (int i = 0; i < 8; ++i) { - __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 128) + 0)); - __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 128) + 16)); - vbehind256a = _mm256_cvtepu8_epi16(vbehinda); - vbehind256b = _mm256_cvtepu8_epi16(vbehindb); - - // Calculate left side of 32 wide lane - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); - - __m256i vrowleft0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowleft1 = _mm256_add_epi16(vrowleft0, vinterpolate); - __m256i vrowleft2 = _mm256_add_epi16(vrowleft1, vinterpolate); - - vrowleft0 
= _mm256_srai_epi16(vrowleft0, log2_factor); - vrowleft1 = _mm256_srai_epi16(vrowleft1, log2_factor); - vrowleft2 = _mm256_srai_epi16(vrowleft2, log2_factor); - - - // Calculate right side of 32 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); - - __m256i vrowright0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowright1 = _mm256_add_epi16(vrowright0, vinterpolate); - __m256i vrowright2 = _mm256_add_epi16(vrowright1, vinterpolate); - - vrowright0 = _mm256_srai_epi16(vrowright0, log2_factor); - vrowright1 = _mm256_srai_epi16(vrowright1, log2_factor); - vrowright2 = _mm256_srai_epi16(vrowright2, log2_factor); - - - // Store results - __m256i vres0 = _mm256_packus_epi16(vrowleft0, vrowright0); - __m256i vres1 = _mm256_packus_epi16(vrowleft1, vrowright1); - __m256i vres2 = _mm256_packus_epi16(vrowleft2, vrowright2); - - vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres2); - - vbefore256a = vbehind256a; - vbefore256b = vbehind256b; - } -} - static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -6804,6 +6572,7 @@ static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } +// TODO: check which upsampling w32 ups8 version is faster and delete the obsolete one. 
static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; @@ -7011,151 +6780,6 @@ static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } -static void mip_upsampling_w64_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 4; // height / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - __m256i vbefore256a; - __m256i vbehind256a; - - __m256i vbefore256b; - __m256i vbehind256b; - - __m256i vbefore256c; - __m256i vbehind256c; - - __m256i vbefore256d; - __m256i vbehind256d; - - __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); - __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); - __m128i vbeforec = _mm_load_si128((__m128i*)(ref + 32)); - __m128i vbefored = _mm_load_si128((__m128i*)(ref + 48)); - vbefore256a = _mm256_cvtepu8_epi16(vbeforea); - vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); - vbefore256c = _mm256_cvtepu8_epi16(vbeforec); - vbefore256d = _mm256_cvtepu8_epi16(vbefored); - - for (int i = 0; i < 8; ++i) { - __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 256) + 0)); - __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 256) + 16)); - __m128i vbehindc = _mm_loadu_si128((__m128i*)(src + (i * 256) + 32)); - __m128i vbehindd = _mm_loadu_si128((__m128i*)(src + (i * 256) + 48)); - vbehind256a = _mm256_cvtepu8_epi16(vbehinda); - vbehind256b = _mm256_cvtepu8_epi16(vbehindb); - vbehind256c = _mm256_cvtepu8_epi16(vbehindc); - vbehind256d = _mm256_cvtepu8_epi16(vbehindd); - - // Calculate 1/4 part of 64 wide lane - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); - - __m256i vrowa0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowa1 = _mm256_add_epi16(vrowa0, vinterpolate); - __m256i vrowa2 = _mm256_add_epi16(vrowa1, vinterpolate); - - vrowa0 = _mm256_srai_epi16(vrowa0, log2_factor); - vrowa1 = _mm256_srai_epi16(vrowa1, log2_factor); - vrowa2 = _mm256_srai_epi16(vrowa2, log2_factor); - - - // Calculate 2/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); - - __m256i vrowb0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowb1 = _mm256_add_epi16(vrowb0, vinterpolate); - __m256i vrowb2 = _mm256_add_epi16(vrowb1, vinterpolate); - - vrowb0 = _mm256_srai_epi16(vrowb0, log2_factor); - vrowb1 = _mm256_srai_epi16(vrowb1, log2_factor); - vrowb2 = _mm256_srai_epi16(vrowb2, log2_factor); - - - // Calculate 3/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256c, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256c, vbefore256c); - - __m256i vrowc0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowc1 = _mm256_add_epi16(vrowc0, vinterpolate); - __m256i vrowc2 = _mm256_add_epi16(vrowc1, vinterpolate); - - vrowc0 = _mm256_srai_epi16(vrowc0, log2_factor); - vrowc1 = _mm256_srai_epi16(vrowc1, log2_factor); - 
vrowc2 = _mm256_srai_epi16(vrowc2, log2_factor); - - - // Calculate 3/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256d, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256d, vbefore256d); - - __m256i vrowd0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowd1 = _mm256_add_epi16(vrowd0, vinterpolate); - __m256i vrowd2 = _mm256_add_epi16(vrowd1, vinterpolate); - - vrowd0 = _mm256_srai_epi16(vrowd0, log2_factor); - vrowd1 = _mm256_srai_epi16(vrowd1, log2_factor); - vrowd2 = _mm256_srai_epi16(vrowd2, log2_factor); - - - // Store results - __m256i vres0left = _mm256_packus_epi16(vrowa0, vrowb0); - __m256i vres0right = _mm256_packus_epi16(vrowc0, vrowd0); - __m256i vres1left = _mm256_packus_epi16(vrowa1, vrowb1); - __m256i vres1right = _mm256_packus_epi16(vrowc1, vrowd1); - __m256i vres2left = _mm256_packus_epi16(vrowa2, vrowb2); - __m256i vres2right = _mm256_packus_epi16(vrowc2, vrowd2); - - /*vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0));*/ - - vres0left = _mm256_permute4x64_epi64(vres0left, _MM_SHUFFLE(3, 1, 2, 0)); - vres0right = _mm256_permute4x64_epi64(vres0right, _MM_SHUFFLE(3, 1, 2, 0)); - vres1left = _mm256_permute4x64_epi64(vres1left, _MM_SHUFFLE(3, 1, 2, 0)); - vres1right = _mm256_permute4x64_epi64(vres1right, _MM_SHUFFLE(3, 1, 2, 0)); - vres2left = _mm256_permute4x64_epi64(vres2left, _MM_SHUFFLE(3, 1, 2, 0)); - vres2right = _mm256_permute4x64_epi64(vres2right, _MM_SHUFFLE(3, 1, 2, 0)); - - /*_mm256_store_si256((__m256i*)(dst + (i * 128) + 0), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 32), vres1); - _mm256_store_si256((__m256i*)(dst + (i * 128) + 64), vres2);*/ - - _mm256_store_si256((__m256i*)(dst + (i * 256) + 0), vres0left); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 32), vres0right); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 64), vres1left); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 96), vres1right); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 128), vres2left); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 160), vres2right); - - vbefore256a = vbehind256a; - vbefore256b = vbehind256b; - vbefore256c = vbehind256c; - vbefore256d = vbehind256d; - } -} - static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -7235,6 +6859,7 @@ static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } +// TODO: check which upsampling w64 ups8 version is faster and delete the obsolete one. static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uint8_t red_pred_size = 8; From 244d263b8178a8f4bab74ebd71daba56b58ca293 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:44:37 +0300 Subject: [PATCH 231/237] Remove unused mip upsampling code. 
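The ups4 upsamplers removed here and in the previous patch all reduce to the same linear interpolation with rounding between a "before" sample (from the reference line or the previous reduced row) and the next reduced-prediction sample, which is what the retained variants keep computing with different instruction mixes. A scalar sketch of that arithmetic for one column of a vertical pass (the helper name and signature below are illustrative only, not from the codebase):

  #include <stdint.h>

  static void mip_upsample_column_sketch(uint8_t *dst, int dst_stride,
                                         uint8_t before,        // sample from the reference row
                                         const uint8_t *behind, // reduced rows, red_stride apart
                                         int red_stride,
                                         int red_rows,          // reduced prediction height, e.g. 8
                                         int ups_factor)        // upsampling factor: 2, 4 or 8
  {
    int log2_factor = 0;
    while ((1 << log2_factor) < ups_factor) ++log2_factor;
    const int rounding_offset = 1 << (log2_factor - 1);

    for (int r = 0; r < red_rows; ++r) {
      const uint8_t next = behind[r * red_stride];
      // ups_factor - 1 interpolated rows, then the reduced row itself
      // (k == ups_factor yields exactly next, matching the vector code
      // that stores the "behind" row unmodified).
      for (int k = 1; k <= ups_factor; ++k) {
        int val = ((ups_factor - k) * before + k * next + rounding_offset) >> log2_factor;
        dst[(r * ups_factor + k - 1) * dst_stride] = (uint8_t)val;
      }
      before = next;
    }
  }
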
--- src/strategies/avx2/intra-avx2.c | 738 +------------------------------ 1 file changed, 12 insertions(+), 726 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index fe2f06eb..5ebc642d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4280,373 +4280,16 @@ static void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, } -// Calculate the DC value for a 4x4 block. The algorithm uses slightly -// different addends, multipliers etc for different pixels in the block, -// but for a fixed-size implementation one vector wide, all the weights, -// addends etc can be preinitialized for each position. -static void pred_filtered_dc_4x4(const uint8_t *ref_top, - const uint8_t *ref_left, - uint8_t *out_block, - const uint8_t multi_ref_idx) -{ - const uint32_t rt_u32 = *(const uint32_t *)(ref_top + 1); - const uint32_t rl_u32 = *(const uint32_t *)(ref_left + 1); - - const __m128i zero = _mm_setzero_si128(); - const __m128i twos = _mm_set1_epi8(2); - - // Hack. Move 4 u8's to bit positions 0, 64, 128 and 192 in two regs, to - // expand them to 16 bits sort of "for free". Set highest bits on all the - // other bytes in vectors to zero those bits in the result vector. - const __m128i rl_shuf_lo = _mm_setr_epi32(0x80808000, 0x80808080, - 0x80808001, 0x80808080); - const __m128i rl_shuf_hi = _mm_add_epi8 (rl_shuf_lo, twos); - - // Every second multiplier is 1, because we want maddubs to calculate - // a + bc = 1 * a + bc (actually 2 + bc). We need to fill a vector with - // ((u8)2)'s for other stuff anyway, so that can also be used here. - const __m128i mult_lo = _mm_setr_epi32(0x01030102, 0x01030103, - 0x01040103, 0x01040104); - const __m128i mult_hi = _mm_setr_epi32(0x01040103, 0x01040104, - 0x01040103, 0x01040104); - __m128i four = _mm_cvtsi32_si128 (4); - __m128i rt = _mm_cvtsi32_si128 (rt_u32); - __m128i rl = _mm_cvtsi32_si128 (rl_u32); - __m128i rtrl = _mm_unpacklo_epi32 (rt, rl); - - __m128i sad0 = _mm_sad_epu8 (rtrl, zero); - __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); - __m128i sad2 = _mm_add_epi64 (sad0, sad1); - __m128i sad3 = _mm_add_epi64 (sad2, four); - - __m128i dc_64 = _mm_srli_epi64 (sad3, 3); - __m128i dc_8 = _mm_broadcastb_epi8(dc_64); - - __m128i rl_lo = _mm_shuffle_epi8 (rl, rl_shuf_lo); - __m128i rl_hi = _mm_shuffle_epi8 (rl, rl_shuf_hi); - - __m128i rt_lo = _mm_unpacklo_epi8 (rt, zero); - __m128i rt_hi = zero; - - __m128i dc_addend = _mm_unpacklo_epi8(dc_8, twos); - - __m128i dc_multd_lo = _mm_maddubs_epi16(dc_addend, mult_lo); - __m128i dc_multd_hi = _mm_maddubs_epi16(dc_addend, mult_hi); - - __m128i rl_rt_lo = _mm_add_epi16 (rl_lo, rt_lo); - __m128i rl_rt_hi = _mm_add_epi16 (rl_hi, rt_hi); - - __m128i res_lo = _mm_add_epi16 (dc_multd_lo, rl_rt_lo); - __m128i res_hi = _mm_add_epi16 (dc_multd_hi, rl_rt_hi); - - res_lo = _mm_srli_epi16 (res_lo, 2); - res_hi = _mm_srli_epi16 (res_hi, 2); - - __m128i final = _mm_packus_epi16 (res_lo, res_hi); - _mm_storeu_si128((__m128i *)out_block, final); -} - -static void pred_filtered_dc_8x8(const uint8_t *ref_top, - const uint8_t *ref_left, - uint8_t *out_block, - const uint8_t multi_ref_idx) -{ - const uint64_t rt_u64 = *(const uint64_t *)(ref_top + 1); - const uint64_t rl_u64 = *(const uint64_t *)(ref_left + 1); - - const __m128i zero128 = _mm_setzero_si128(); - const __m256i twos = _mm256_set1_epi8(2); - - // DC multiplier is 2 at (0, 0), 3 at (*, 0) and (0, *), and 4 at (*, *). 
- // There is a constant addend of 2 on each pixel, use values from the twos - // register and multipliers of 1 for that, to use maddubs for an (a*b)+c - // operation. - const __m256i mult_up_lo = _mm256_setr_epi32(0x01030102, 0x01030103, - 0x01030103, 0x01030103, - 0x01040103, 0x01040104, - 0x01040104, 0x01040104); - - // The 6 lowest rows have same multipliers, also the DC values and addends - // are the same so this works for all of those - const __m256i mult_rest = _mm256_permute4x64_epi64(mult_up_lo, _MM_SHUFFLE(3, 2, 3, 2)); - - // Every 8-pixel row starts with the next pixel of ref_left. Along with - // doing the shuffling, also expand u8->u16, ie. move bytes 0 and 1 from - // ref_left to bit positions 0 and 128 in rl_up_lo, 2 and 3 to rl_up_hi, - // etc. The places to be zeroed out are 0x80 instead of the usual 0xff, - // because this allows us to form new masks on the fly by adding 0x02-bytes - // to this mask and still retain the highest bits as 1 where things should - // be zeroed out. - const __m256i rl_shuf_up_lo = _mm256_setr_epi32(0x80808000, 0x80808080, - 0x80808080, 0x80808080, - 0x80808001, 0x80808080, - 0x80808080, 0x80808080); - // And don't waste memory or architectural regs, hope these instructions - // will be placed in between the shuffles by the compiler to only use one - // register for the shufmasks, and executed way ahead of time because their - // regs can be renamed. - const __m256i rl_shuf_up_hi = _mm256_add_epi8 (rl_shuf_up_lo, twos); - const __m256i rl_shuf_dn_lo = _mm256_add_epi8 (rl_shuf_up_hi, twos); - const __m256i rl_shuf_dn_hi = _mm256_add_epi8 (rl_shuf_dn_lo, twos); - - __m128i eight = _mm_cvtsi32_si128 (8); - __m128i rt = _mm_cvtsi64_si128 (rt_u64); - __m128i rl = _mm_cvtsi64_si128 (rl_u64); - __m128i rtrl = _mm_unpacklo_epi64 (rt, rl); - - __m128i sad0 = _mm_sad_epu8 (rtrl, zero128); - __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); - __m128i sad2 = _mm_add_epi64 (sad0, sad1); - __m128i sad3 = _mm_add_epi64 (sad2, eight); - - __m128i dc_64 = _mm_srli_epi64 (sad3, 4); - __m256i dc_8 = _mm256_broadcastb_epi8(dc_64); - - __m256i dc_addend = _mm256_unpacklo_epi8 (dc_8, twos); - - __m256i dc_up_lo = _mm256_maddubs_epi16 (dc_addend, mult_up_lo); - __m256i dc_rest = _mm256_maddubs_epi16 (dc_addend, mult_rest); - - // rt_dn is all zeros, as is rt_up_hi. This'll get us the rl and rt parts - // in A|B, C|D order instead of A|C, B|D that could be packed into abcd - // order, so these need to be permuted before adding to the weighed DC - // values. 
- __m256i rt_up_lo = _mm256_cvtepu8_epi16 (rt); - - __m256i rlrlrlrl = _mm256_broadcastq_epi64(rl); - __m256i rl_up_lo = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_up_lo); - - // Everything ref_top is zero except on the very first row - __m256i rt_rl_up_hi = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_up_hi); - __m256i rt_rl_dn_lo = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_dn_lo); - __m256i rt_rl_dn_hi = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_dn_hi); - - __m256i rt_rl_up_lo = _mm256_add_epi16 (rt_up_lo, rl_up_lo); - - __m256i rt_rl_up_lo_2 = _mm256_permute2x128_si256(rt_rl_up_lo, rt_rl_up_hi, 0x20); - __m256i rt_rl_up_hi_2 = _mm256_permute2x128_si256(rt_rl_up_lo, rt_rl_up_hi, 0x31); - __m256i rt_rl_dn_lo_2 = _mm256_permute2x128_si256(rt_rl_dn_lo, rt_rl_dn_hi, 0x20); - __m256i rt_rl_dn_hi_2 = _mm256_permute2x128_si256(rt_rl_dn_lo, rt_rl_dn_hi, 0x31); - - __m256i up_lo = _mm256_add_epi16(rt_rl_up_lo_2, dc_up_lo); - __m256i up_hi = _mm256_add_epi16(rt_rl_up_hi_2, dc_rest); - __m256i dn_lo = _mm256_add_epi16(rt_rl_dn_lo_2, dc_rest); - __m256i dn_hi = _mm256_add_epi16(rt_rl_dn_hi_2, dc_rest); - - up_lo = _mm256_srli_epi16(up_lo, 2); - up_hi = _mm256_srli_epi16(up_hi, 2); - dn_lo = _mm256_srli_epi16(dn_lo, 2); - dn_hi = _mm256_srli_epi16(dn_hi, 2); - - __m256i res_up = _mm256_packus_epi16(up_lo, up_hi); - __m256i res_dn = _mm256_packus_epi16(dn_lo, dn_hi); - - _mm256_storeu_si256(((__m256i *)out_block) + 0, res_up); - _mm256_storeu_si256(((__m256i *)out_block) + 1, res_dn); -} - -static INLINE __m256i cvt_u32_si256(const uint32_t u) -{ - const __m256i zero = _mm256_setzero_si256(); - return _mm256_insert_epi32(zero, u, 0); -} - -static void pred_filtered_dc_16x16(const uint8_t *ref_top, - const uint8_t *ref_left, - uint8_t *out_block, - const uint8_t multi_ref_idx) -{ - const __m128i rt_128 = _mm_loadu_si128((const __m128i *)(ref_top + 1)); - const __m128i rl_128 = _mm_loadu_si128((const __m128i *)(ref_left + 1)); - - const __m128i zero_128 = _mm_setzero_si128(); - const __m256i zero = _mm256_setzero_si256(); - const __m256i twos = _mm256_set1_epi8(2); - - const __m256i mult_r0 = _mm256_setr_epi32(0x01030102, 0x01030103, - 0x01030103, 0x01030103, - 0x01030103, 0x01030103, - 0x01030103, 0x01030103); - - const __m256i mult_left = _mm256_set1_epi16(0x0103); - - // Leftmost bytes' blend mask, to move bytes (pixels) from the leftmost - // column vector to the result row - const __m256i lm8_bmask = _mm256_setr_epi32(0xff, 0, 0, 0, 0xff, 0, 0, 0); - - __m128i sixteen = _mm_cvtsi32_si128(16); - __m128i sad0_t = _mm_sad_epu8 (rt_128, zero_128); - __m128i sad0_l = _mm_sad_epu8 (rl_128, zero_128); - __m128i sad0 = _mm_add_epi64(sad0_t, sad0_l); - - __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); - __m128i sad2 = _mm_add_epi64 (sad0, sad1); - __m128i sad3 = _mm_add_epi64 (sad2, sixteen); - - __m128i dc_64 = _mm_srli_epi64 (sad3, 5); - __m256i dc_8 = _mm256_broadcastb_epi8 (dc_64); - - __m256i rt = _mm256_cvtepu8_epi16 (rt_128); - __m256i rl = _mm256_cvtepu8_epi16 (rl_128); - - uint8_t rl0 = *(uint8_t *)(ref_left + 1); - __m256i rl_r0 = cvt_u32_si256((uint32_t)rl0); - - __m256i rlrt_r0 = _mm256_add_epi16(rl_r0, rt); - - __m256i dc_addend = _mm256_unpacklo_epi8(dc_8, twos); - __m256i r0 = _mm256_maddubs_epi16(dc_addend, mult_r0); - __m256i left_dcs = _mm256_maddubs_epi16(dc_addend, mult_left); - - r0 = _mm256_add_epi16 (r0, rlrt_r0); - r0 = _mm256_srli_epi16 (r0, 2); - __m256i r0r0 = _mm256_packus_epi16 (r0, r0); - r0r0 = _mm256_permute4x64_epi64(r0r0, _MM_SHUFFLE(3, 1, 2, 0)); - - __m256i leftmosts = 
_mm256_add_epi16 (left_dcs, rl); - leftmosts = _mm256_srli_epi16 (leftmosts, 2); - - // Contain the leftmost column's bytes in both lanes of lm_8 - __m256i lm_8 = _mm256_packus_epi16 (leftmosts, zero); - lm_8 = _mm256_permute4x64_epi64(lm_8, _MM_SHUFFLE(2, 0, 2, 0)); - - __m256i lm8_r1 = _mm256_srli_epi32 (lm_8, 8); - __m256i r1r1 = _mm256_blendv_epi8 (dc_8, lm8_r1, lm8_bmask); - __m256i r0r1 = _mm256_blend_epi32 (r0r0, r1r1, 0xf0); - - _mm256_storeu_si256((__m256i *)out_block, r0r1); - - // Starts from 2 because row 0 (and row 1) is handled separately - __m256i lm8_l = _mm256_bsrli_epi128 (lm_8, 2); - __m256i lm8_h = _mm256_bsrli_epi128 (lm_8, 3); - lm_8 = _mm256_blend_epi32 (lm8_l, lm8_h, 0xf0); - - for (uint32_t y = 2; y < 16; y += 2) { - __m256i curr_row = _mm256_blendv_epi8 (dc_8, lm_8, lm8_bmask); - _mm256_storeu_si256((__m256i *)(out_block + (y << 4)), curr_row); - lm_8 = _mm256_bsrli_epi128(lm_8, 2); - } -} - -static void pred_filtered_dc_32x32(const uint8_t *ref_top, - const uint8_t *ref_left, - uint8_t *out_block, - const uint8_t multi_ref_idx) -{ - const __m256i rt = _mm256_loadu_si256((const __m256i *)(ref_top + 1)); - const __m256i rl = _mm256_loadu_si256((const __m256i *)(ref_left + 1)); - - const __m256i zero = _mm256_setzero_si256(); - const __m256i twos = _mm256_set1_epi8(2); - - const __m256i mult_r0lo = _mm256_setr_epi32(0x01030102, 0x01030103, - 0x01030103, 0x01030103, - 0x01030103, 0x01030103, - 0x01030103, 0x01030103); - - const __m256i mult_left = _mm256_set1_epi16(0x0103); - const __m256i lm8_bmask = cvt_u32_si256 (0xff); - - const __m256i bshif_msk = _mm256_setr_epi32(0x04030201, 0x08070605, - 0x0c0b0a09, 0x800f0e0d, - 0x03020100, 0x07060504, - 0x0b0a0908, 0x0f0e0d0c); - __m256i debias = cvt_u32_si256(32); - __m256i sad0_t = _mm256_sad_epu8 (rt, zero); - __m256i sad0_l = _mm256_sad_epu8 (rl, zero); - __m256i sad0 = _mm256_add_epi64 (sad0_t, sad0_l); - - __m256i sad1 = _mm256_permute4x64_epi64(sad0, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i sad2 = _mm256_add_epi64 (sad0, sad1); - __m256i sad3 = _mm256_shuffle_epi32 (sad2, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i sad4 = _mm256_add_epi64 (sad2, sad3); - __m256i sad5 = _mm256_add_epi64 (sad4, debias); - __m256i dc_64 = _mm256_srli_epi64 (sad5, 6); - - __m128i dc_64_ = _mm256_castsi256_si128 (dc_64); - __m256i dc_8 = _mm256_broadcastb_epi8 (dc_64_); - - __m256i rtlo = _mm256_unpacklo_epi8 (rt, zero); - __m256i rllo = _mm256_unpacklo_epi8 (rl, zero); - __m256i rthi = _mm256_unpackhi_epi8 (rt, zero); - __m256i rlhi = _mm256_unpackhi_epi8 (rl, zero); - - __m256i dc_addend = _mm256_unpacklo_epi8 (dc_8, twos); - __m256i r0lo = _mm256_maddubs_epi16 (dc_addend, mult_r0lo); - __m256i r0hi = _mm256_maddubs_epi16 (dc_addend, mult_left); - __m256i c0dc = r0hi; - - r0lo = _mm256_add_epi16 (r0lo, rtlo); - r0hi = _mm256_add_epi16 (r0hi, rthi); - - __m256i rlr0 = _mm256_blendv_epi8 (zero, rl, lm8_bmask); - r0lo = _mm256_add_epi16 (r0lo, rlr0); - - r0lo = _mm256_srli_epi16 (r0lo, 2); - r0hi = _mm256_srli_epi16 (r0hi, 2); - __m256i r0 = _mm256_packus_epi16 (r0lo, r0hi); - - _mm256_storeu_si256((__m256i *)out_block, r0); - - __m256i c0lo = _mm256_add_epi16 (c0dc, rllo); - __m256i c0hi = _mm256_add_epi16 (c0dc, rlhi); - c0lo = _mm256_srli_epi16 (c0lo, 2); - c0hi = _mm256_srli_epi16 (c0hi, 2); - - __m256i c0 = _mm256_packus_epi16 (c0lo, c0hi); - - // r0 already handled! 
- for (uint32_t y = 1; y < 32; y++) { - if (y == 16) { - c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2)); - } else { - c0 = _mm256_shuffle_epi8 (c0, bshif_msk); - } - __m256i curr_row = _mm256_blendv_epi8 (dc_8, c0, lm8_bmask); - _mm256_storeu_si256(((__m256i *)out_block) + y, curr_row); - } -} - -/** -* \brief Generage intra DC prediction with post filtering applied. -* \param log2_width Log2 of width, range 2..5. -* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. -* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. -* \param dst Buffer of size width*width. -* \param multi_ref_idx Reference line index. May be non-zero when MRL is used. -*/ -static void uvg_intra_pred_filtered_dc_avx2( - const int_fast8_t log2_width, - const uint8_t *ref_top, - const uint8_t *ref_left, - uint8_t *out_block, - const uint8_t multi_ref_idx) -{ - assert(log2_width >= 2 && log2_width <= 5); - - // TODO: implement multi reference index for all subfunctions - if (log2_width == 2) { - pred_filtered_dc_4x4(ref_top, ref_left, out_block, multi_ref_idx); - } else if (log2_width == 3) { - pred_filtered_dc_8x8(ref_top, ref_left, out_block, multi_ref_idx); - } else if (log2_width == 4) { - pred_filtered_dc_16x16(ref_top, ref_left, out_block, multi_ref_idx); - } else if (log2_width == 5) { - pred_filtered_dc_32x32(ref_top, ref_left, out_block, multi_ref_idx); - } -} - -// TODO: update all ranges (in comments, etc.) from HEVC to VVC - /** * \brief Position Dependent Prediction Combination for Planar and DC modes. -* \param log2_width Log2 of width, range 2..5. -* \param width Block width matching log2_width. -* \param used_ref Pointer used reference pixel struct. -* \param dst Buffer of size width*width. +* \param mode Intra mode, 0 for planar, 1 for DC. +* \param cu_loc Pointer to the CU location information. +* \param color Color component. +* \param used_ref Pointer to the used reference pixels. +* \param dst Buffer of size MAX_PRED_WIDTH * MAX_PRED_WIDTH. */ -// TODO: does not work with blocks with height 1 and 2 -// TODO: also has width someplaces where height should be +// TODO: allegedly does not work with blocks with height 1 and 2. Test this. +// TODO: or just rework the whole thing. We might be able to optimize this further. static void uvg_pdpc_planar_dc_avx2( const int mode, const cu_loc_t* const cu_loc, @@ -4654,7 +4297,6 @@ static void uvg_pdpc_planar_dc_avx2( const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { - // ISP_TODO: non-square block implementation, height is passed but not used assert(mode == 0 || mode == 1); // planar or DC const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; @@ -4675,7 +4317,6 @@ static void uvg_pdpc_planar_dc_avx2( 6, 7, 6, 7, 6, 7, 6, 7 ); - // TODO: replace latter log2_width with log2_height const int scale = ((log2_width - 2 + log2_height - 2 + 2) >> 2); // Same weights regardless of axis, compute once @@ -6068,6 +5709,7 @@ static void mip_upsampling_w4_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe _mm256_store_si256((__m256i*)(dst + 96), vres3); } + static void mip_upsampling_w8_ups2_h8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { int64_t refline = *(int64_t*)ref; @@ -6318,6 +5960,7 @@ static void mip_upsampling_w8_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixe } } + static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m128i vbehind0 = _mm_loadu_si128((__m128i*)(src + 0)); @@ -6357,54 +6000,6 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 224), vavg7); } -// TODO: check which upsampling w16 ups4 version is faster and delete the obsolete one. -static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 4; // height / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - __m256i vbefore256; - __m256i vbehind256; - - __m128i vbefore = _mm_loadu_si128((__m128i*)ref); - vbefore256 = _mm256_cvtepu8_epi16(vbefore); - - for (int i = 0; i < 8; ++i) { - __m128i vbehind = _mm_loadu_si128((__m128i*)(src + (i * 64))); - vbehind256 = _mm256_cvtepu8_epi16(vbehind); - - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256, vbefore256); - - __m256i vrow0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrow1 = _mm256_add_epi16(vrow0, vinterpolate); - __m256i vrow2 = _mm256_add_epi16(vrow1, vinterpolate); - - vrow0 = _mm256_srai_epi16(vrow0, log2_factor); - vrow1 = _mm256_srai_epi16(vrow1, log2_factor); - vrow2 = _mm256_srai_epi16(vrow2, log2_factor); - - __m256i vres0 = _mm256_packus_epi16(vrow0, vrow1); - __m256i vres1 = _mm256_packus_epi16(vrow2, vbehind256); - - vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_storeu_si256((__m256i*)(dst + (i * 64) + 0), vres0); - _mm256_storeu_si256((__m256i*)(dst + (i * 64) + 32), vres1); - - vbefore256 = vbehind256; - } -} - static void mip_upsampling_w16_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -6512,6 +6107,7 @@ static void mip_upsampling_w16_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pix } } + static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbefore = _mm256_loadu_si256((__m256i*)ref); @@ -6572,113 +6168,6 @@ static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } -// TODO: check which upsampling w32 ups8 version is faster and delete the obsolete one. 
-static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 8; // height / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - __m256i vbefore256a; - __m256i vbehind256a; - - __m256i vbefore256b; - __m256i vbehind256b; - - __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); - __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); - vbefore256a = _mm256_cvtepu8_epi16(vbeforea); - vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); - - for (int i = 0; i < 8; ++i) { - __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 256) + 0)); - __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 256) + 16)); - vbehind256a = _mm256_cvtepu8_epi16(vbehinda); - vbehind256b = _mm256_cvtepu8_epi16(vbehindb); - - // Calculate left side of 32 wide lane - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); - - __m256i vrowleft0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowleft1 = _mm256_add_epi16(vrowleft0, vinterpolate); - __m256i vrowleft2 = _mm256_add_epi16(vrowleft1, vinterpolate); - __m256i vrowleft3 = _mm256_add_epi16(vrowleft2, vinterpolate); - __m256i vrowleft4 = _mm256_add_epi16(vrowleft3, vinterpolate); - __m256i vrowleft5 = _mm256_add_epi16(vrowleft4, vinterpolate); - __m256i vrowleft6 = _mm256_add_epi16(vrowleft5, vinterpolate); - - vrowleft0 = _mm256_srai_epi16(vrowleft0, log2_factor); - vrowleft1 = _mm256_srai_epi16(vrowleft1, log2_factor); - vrowleft2 = _mm256_srai_epi16(vrowleft2, log2_factor); - vrowleft3 = _mm256_srai_epi16(vrowleft3, log2_factor); - vrowleft4 = _mm256_srai_epi16(vrowleft4, log2_factor); - vrowleft5 = _mm256_srai_epi16(vrowleft5, log2_factor); - vrowleft6 = _mm256_srai_epi16(vrowleft6, log2_factor); - - - // Calculate right side of 32 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); - - __m256i vrowright0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowright1 = _mm256_add_epi16(vrowright0, vinterpolate); - __m256i vrowright2 = _mm256_add_epi16(vrowright1, vinterpolate); - __m256i vrowright3 = _mm256_add_epi16(vrowright2, vinterpolate); - __m256i vrowright4 = _mm256_add_epi16(vrowright3, vinterpolate); - __m256i vrowright5 = _mm256_add_epi16(vrowright4, vinterpolate); - __m256i vrowright6 = _mm256_add_epi16(vrowright5, vinterpolate); - - vrowright0 = _mm256_srai_epi16(vrowright0, log2_factor); - vrowright1 = _mm256_srai_epi16(vrowright1, log2_factor); - vrowright2 = _mm256_srai_epi16(vrowright2, log2_factor); - vrowright3 = _mm256_srai_epi16(vrowright3, log2_factor); - vrowright4 = _mm256_srai_epi16(vrowright4, log2_factor); - vrowright5 = _mm256_srai_epi16(vrowright5, log2_factor); - vrowright6 = _mm256_srai_epi16(vrowright6, log2_factor); - - - // Store results - __m256i vres0 = _mm256_packus_epi16(vrowleft0, vrowright0); - __m256i vres1 = _mm256_packus_epi16(vrowleft1, vrowright1); - __m256i vres2 = _mm256_packus_epi16(vrowleft2, vrowright2); - __m256i vres3 = _mm256_packus_epi16(vrowleft3, vrowright3); - __m256i vres4 = 
_mm256_packus_epi16(vrowleft4, vrowright4); - __m256i vres5 = _mm256_packus_epi16(vrowleft5, vrowright5); - __m256i vres6 = _mm256_packus_epi16(vrowleft6, vrowright6); - - vres0 = _mm256_permute4x64_epi64(vres0, _MM_SHUFFLE(3, 1, 2, 0)); - vres1 = _mm256_permute4x64_epi64(vres1, _MM_SHUFFLE(3, 1, 2, 0)); - vres2 = _mm256_permute4x64_epi64(vres2, _MM_SHUFFLE(3, 1, 2, 0)); - vres3 = _mm256_permute4x64_epi64(vres3, _MM_SHUFFLE(3, 1, 2, 0)); - vres4 = _mm256_permute4x64_epi64(vres4, _MM_SHUFFLE(3, 1, 2, 0)); - vres5 = _mm256_permute4x64_epi64(vres5, _MM_SHUFFLE(3, 1, 2, 0)); - vres6 = _mm256_permute4x64_epi64(vres6, _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(dst + (i * 256) + 0), vres0); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 32), vres1); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 64), vres2); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 96), vres3); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 128), vres4); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 160), vres5); - _mm256_store_si256((__m256i*)(dst + (i * 256) + 192), vres6); - - vbefore256a = vbehind256a; - vbefore256b = vbehind256b; - } -} - static void mip_upsampling_w32_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -6761,6 +6250,7 @@ static void mip_upsampling_w32_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg } } + static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { __m256i vbeforeleft = _mm256_load_si256((__m256i*)(ref + 0)); @@ -6859,209 +6349,6 @@ static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } -// TODO: check which upsampling w64 ups8 version is faster and delete the obsolete one. 
-static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) -{ - const uint8_t red_pred_size = 8; - const uint8_t ups_factor = 8; // height / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - const int rounding_offset = 1 << (log2_factor - 1); - __m256i vrnd = _mm256_set1_epi16(rounding_offset); - - __m256i vbefore256a; - __m256i vbehind256a; - - __m256i vbefore256b; - __m256i vbehind256b; - - __m256i vbefore256c; - __m256i vbehind256c; - - __m256i vbefore256d; - __m256i vbehind256d; - - __m128i vbeforea = _mm_load_si128((__m128i*)(ref + 0)); - __m128i vbeforeb = _mm_load_si128((__m128i*)(ref + 16)); - __m128i vbeforec = _mm_load_si128((__m128i*)(ref + 32)); - __m128i vbefored = _mm_load_si128((__m128i*)(ref + 48)); - vbefore256a = _mm256_cvtepu8_epi16(vbeforea); - vbefore256b = _mm256_cvtepu8_epi16(vbeforeb); - vbefore256c = _mm256_cvtepu8_epi16(vbeforec); - vbefore256d = _mm256_cvtepu8_epi16(vbefored); - - for (int i = 0; i < 8; ++i) { - __m128i vbehinda = _mm_loadu_si128((__m128i*)(src + (i * 512) + 0)); - __m128i vbehindb = _mm_loadu_si128((__m128i*)(src + (i * 512) + 16)); - __m128i vbehindc = _mm_loadu_si128((__m128i*)(src + (i * 512) + 32)); - __m128i vbehindd = _mm_loadu_si128((__m128i*)(src + (i * 512) + 48)); - vbehind256a = _mm256_cvtepu8_epi16(vbehinda); - vbehind256b = _mm256_cvtepu8_epi16(vbehindb); - vbehind256c = _mm256_cvtepu8_epi16(vbehindc); - vbehind256d = _mm256_cvtepu8_epi16(vbehindd); - - // Calculate 1/4 part of 64 wide lane - __m256i vbeforeshifted = _mm256_slli_epi16(vbefore256a, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - __m256i vinterpolate = _mm256_sub_epi16(vbehind256a, vbefore256a); - - __m256i vrowa0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowa1 = _mm256_add_epi16(vrowa0, vinterpolate); - __m256i vrowa2 = _mm256_add_epi16(vrowa1, vinterpolate); - __m256i vrowa3 = _mm256_add_epi16(vrowa2, vinterpolate); - __m256i vrowa4 = _mm256_add_epi16(vrowa3, vinterpolate); - __m256i vrowa5 = _mm256_add_epi16(vrowa4, vinterpolate); - __m256i vrowa6 = _mm256_add_epi16(vrowa5, vinterpolate); - - vrowa0 = _mm256_srai_epi16(vrowa0, log2_factor); - vrowa1 = _mm256_srai_epi16(vrowa1, log2_factor); - vrowa2 = _mm256_srai_epi16(vrowa2, log2_factor); - vrowa3 = _mm256_srai_epi16(vrowa3, log2_factor); - vrowa4 = _mm256_srai_epi16(vrowa4, log2_factor); - vrowa5 = _mm256_srai_epi16(vrowa5, log2_factor); - vrowa6 = _mm256_srai_epi16(vrowa6, log2_factor); - - - // Calculate 2/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256b, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256b, vbefore256b); - - __m256i vrowb0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowb1 = _mm256_add_epi16(vrowb0, vinterpolate); - __m256i vrowb2 = _mm256_add_epi16(vrowb1, vinterpolate); - __m256i vrowb3 = _mm256_add_epi16(vrowb2, vinterpolate); - __m256i vrowb4 = _mm256_add_epi16(vrowb3, vinterpolate); - __m256i vrowb5 = _mm256_add_epi16(vrowb4, vinterpolate); - __m256i vrowb6 = _mm256_add_epi16(vrowb5, vinterpolate); - - vrowb0 = _mm256_srai_epi16(vrowb0, log2_factor); - vrowb1 = _mm256_srai_epi16(vrowb1, log2_factor); - vrowb2 = _mm256_srai_epi16(vrowb2, log2_factor); - vrowb3 = _mm256_srai_epi16(vrowb3, log2_factor); - vrowb4 = _mm256_srai_epi16(vrowb4, log2_factor); - vrowb5 = _mm256_srai_epi16(vrowb5, 
log2_factor); - vrowb6 = _mm256_srai_epi16(vrowb6, log2_factor); - - - // Calculate 3/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256c, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256c, vbefore256c); - - __m256i vrowc0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowc1 = _mm256_add_epi16(vrowc0, vinterpolate); - __m256i vrowc2 = _mm256_add_epi16(vrowc1, vinterpolate); - __m256i vrowc3 = _mm256_add_epi16(vrowc2, vinterpolate); - __m256i vrowc4 = _mm256_add_epi16(vrowc3, vinterpolate); - __m256i vrowc5 = _mm256_add_epi16(vrowc4, vinterpolate); - __m256i vrowc6 = _mm256_add_epi16(vrowc5, vinterpolate); - - vrowc0 = _mm256_srai_epi16(vrowc0, log2_factor); - vrowc1 = _mm256_srai_epi16(vrowc1, log2_factor); - vrowc2 = _mm256_srai_epi16(vrowc2, log2_factor); - vrowc3 = _mm256_srai_epi16(vrowc3, log2_factor); - vrowc4 = _mm256_srai_epi16(vrowc4, log2_factor); - vrowc5 = _mm256_srai_epi16(vrowc5, log2_factor); - vrowc6 = _mm256_srai_epi16(vrowc6, log2_factor); - - - // Calculate 3/4 part of 64 wide lane - vbeforeshifted = _mm256_slli_epi16(vbefore256d, log2_factor); - - // Add rounding offset - vbeforeshifted = _mm256_add_epi16(vbeforeshifted, vrnd); - - vinterpolate = _mm256_sub_epi16(vbehind256d, vbefore256d); - - __m256i vrowd0 = _mm256_add_epi16(vbeforeshifted, vinterpolate); - __m256i vrowd1 = _mm256_add_epi16(vrowd0, vinterpolate); - __m256i vrowd2 = _mm256_add_epi16(vrowd1, vinterpolate); - __m256i vrowd3 = _mm256_add_epi16(vrowd2, vinterpolate); - __m256i vrowd4 = _mm256_add_epi16(vrowd3, vinterpolate); - __m256i vrowd5 = _mm256_add_epi16(vrowd4, vinterpolate); - __m256i vrowd6 = _mm256_add_epi16(vrowd5, vinterpolate); - - vrowd0 = _mm256_srai_epi16(vrowd0, log2_factor); - vrowd1 = _mm256_srai_epi16(vrowd1, log2_factor); - vrowd2 = _mm256_srai_epi16(vrowd2, log2_factor); - vrowd3 = _mm256_srai_epi16(vrowd3, log2_factor); - vrowd4 = _mm256_srai_epi16(vrowd4, log2_factor); - vrowd5 = _mm256_srai_epi16(vrowd5, log2_factor); - vrowd6 = _mm256_srai_epi16(vrowd6, log2_factor); - - - // Store results - __m256i vres00 = _mm256_packus_epi16(vrowa0, vrowb0); - __m256i vres01 = _mm256_packus_epi16(vrowc0, vrowd0); - - __m256i vres10 = _mm256_packus_epi16(vrowa1, vrowb1); - __m256i vres11 = _mm256_packus_epi16(vrowc1, vrowd1); - - __m256i vres20 = _mm256_packus_epi16(vrowa2, vrowb2); - __m256i vres21 = _mm256_packus_epi16(vrowc2, vrowd2); - - __m256i vres30 = _mm256_packus_epi16(vrowa3, vrowb3); - __m256i vres31 = _mm256_packus_epi16(vrowc3, vrowd3); - - __m256i vres40 = _mm256_packus_epi16(vrowa4, vrowb4); - __m256i vres41 = _mm256_packus_epi16(vrowc4, vrowd4); - - __m256i vres50 = _mm256_packus_epi16(vrowa5, vrowb5); - __m256i vres51 = _mm256_packus_epi16(vrowc5, vrowd5); - - __m256i vres60 = _mm256_packus_epi16(vrowa6, vrowb6); - __m256i vres61 = _mm256_packus_epi16(vrowc6, vrowd6); - - - vres00 = _mm256_permute4x64_epi64(vres00, _MM_SHUFFLE(3, 1, 2, 0)); - vres01 = _mm256_permute4x64_epi64(vres01, _MM_SHUFFLE(3, 1, 2, 0)); - vres10 = _mm256_permute4x64_epi64(vres10, _MM_SHUFFLE(3, 1, 2, 0)); - vres11 = _mm256_permute4x64_epi64(vres11, _MM_SHUFFLE(3, 1, 2, 0)); - vres20 = _mm256_permute4x64_epi64(vres20, _MM_SHUFFLE(3, 1, 2, 0)); - vres21 = _mm256_permute4x64_epi64(vres21, _MM_SHUFFLE(3, 1, 2, 0)); - vres30 = _mm256_permute4x64_epi64(vres30, _MM_SHUFFLE(3, 1, 2, 0)); - vres31 = _mm256_permute4x64_epi64(vres31, _MM_SHUFFLE(3, 1, 2, 0)); - vres40 = 
_mm256_permute4x64_epi64(vres40, _MM_SHUFFLE(3, 1, 2, 0)); - vres41 = _mm256_permute4x64_epi64(vres41, _MM_SHUFFLE(3, 1, 2, 0)); - vres50 = _mm256_permute4x64_epi64(vres50, _MM_SHUFFLE(3, 1, 2, 0)); - vres51 = _mm256_permute4x64_epi64(vres51, _MM_SHUFFLE(3, 1, 2, 0)); - vres60 = _mm256_permute4x64_epi64(vres60, _MM_SHUFFLE(3, 1, 2, 0)); - vres61 = _mm256_permute4x64_epi64(vres61, _MM_SHUFFLE(3, 1, 2, 0)); - - - _mm256_store_si256((__m256i*)(dst + (i * 512) + 0), vres00); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 32), vres01); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 64), vres10); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 96), vres11); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 128), vres20); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 160), vres21); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 192), vres30); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 224), vres31); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 256), vres40); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 288), vres41); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 320), vres50); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 352), vres51); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 384), vres60); - _mm256_store_si256((__m256i*)(dst + (i * 512) + 416), vres61); - - - vbefore256a = vbehind256a; - vbefore256b = vbehind256b; - vbefore256c = vbehind256c; - vbefore256d = vbehind256d; - } -} - static void mip_upsampling_w64_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; @@ -7412,7 +6699,6 @@ static void mip_predict_avx2( } } - //uvg_pixel tmp[64 * 64] = {0}; if (ups_ver_factor > 1) { switch (width) { case 4: @@ -7449,7 +6735,7 @@ static void mip_predict_avx2( mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); } else { mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); From 03811d442cdd9c3060ab51f2f7c18b46dd499ec9 Mon Sep 17 00:00:00 2001 From: siivonek Date: Mon, 9 Sep 2024 13:48:11 +0300 Subject: [PATCH 232/237] Rename the improved pdpc functions. Get rid of the improved tag, no need for it anymore. 
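No functional change is intended: the superseded PDPC variants are removed so that the previously "improved" functions can take over the plain names, and the call sites in uvg_angular_pred_avx2 are updated accordingly. For illustration, each rename follows this pattern (one representative declaration shown; parameter lists are unchanged):

  /* before */
  static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp);
  /* after */
  static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp);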
--- src/strategies/avx2/intra-avx2.c | 152 +++++++------------------------ 1 file changed, 34 insertions(+), 118 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5ebc642d..5f4d56aa 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1949,51 +1949,6 @@ static void angular_pred_non_fractional_angle_pxl_copy_w32_wide_angle_hor_avx2(u } -static void angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) -{ - const int width = 4; - //ALIGNED(32) uint8_t left[4][4]; - __m128i v32s = _mm_set1_epi16(32); - - // Scale can be 0, 1 or 2 - const int offset = scale * 16; - const __m128i vweight = _mm_load_si128((const __m128i*) &intra_pdpc_w4_ver_improved_weight[offset]); - - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - const __m128i vleftshuf = _mm_setr_epi8( - 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, - 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f - ); - - __m128i vidx = _mm_setr_epi32(shifted_inv_angle_sum[0], shifted_inv_angle_sum[1], - shifted_inv_angle_sum[2], shifted_inv_angle_sum[3]); - - // For a 4 width block, height must be at least 4. Handle 4 lines at once. - for (int y = 0; y < height; y += 4) { - __m128i vdst = _mm_loadu_si128((const __m128i*)(dst + y * width)); - __m128i vleft = _mm_i32gather_epi32((const int32_t*)&ref_side[y + 1], vidx, 1); - vleft = _mm_shuffle_epi8(vleft, vleftshuf); - - __m128i vlo = _mm_unpacklo_epi8(vdst, vleft); - __m128i vhi = _mm_unpackhi_epi8(vdst, vleft); - - __m128i vmaddlo = _mm_maddubs_epi16(vlo, vweight); - __m128i vmaddhi = _mm_maddubs_epi16(vhi, vweight); - - vmaddlo = _mm_add_epi16(vmaddlo, v32s); - vmaddhi = _mm_add_epi16(vmaddhi, v32s); - - vmaddlo = _mm_srai_epi16(vmaddlo, 6); - vmaddhi = _mm_srai_epi16(vmaddhi, 6); - - __m128i packed = _mm_packus_epi16(vmaddlo, vmaddhi); - - _mm_store_si128((__m128i*)(dst + (y * width)), packed); - } -} - static void angular_pdpc_ver_w8_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { @@ -2039,44 +1994,6 @@ static void angular_pdpc_ver_w8_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* } -static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) -{ - __m256i v32s = _mm256_set1_epi16(32); - const int scale = 2; // Other functions handle scales 0 and 1 - int limit = 12; // With scale 2, limit is always 12. 
- - const int offset = scale * 16; - const __m256i vweight = _mm256_load_si256((const __m256i*)&intra_pdpc_w16_ver_weight[offset]); - - const int inv_angle_offset = mode_disp * 64; - const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; - - for (int y = 0; y < height; ++y) { - for (int x = 0; x < limit; x += 16) { - ALIGNED(32) int16_t left[16] = {0}; - for (int xx = 0; x + xx < limit; ++xx) { - left[xx] = ref_side[y + shifted_inv_angle_sum[xx] + 1]; - } - - __m128i vdst = _mm_load_si128((const __m128i*)(dst + (y * width + x))); - __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); - __m256i vleft = _mm256_loadu_si256((__m256i*)left); - - __m256i accu = _mm256_sub_epi16(vleft, vdst16); - accu = _mm256_mullo_epi16(vweight, accu); - accu = _mm256_add_epi16(accu, v32s); - accu = _mm256_srai_epi16(accu, 6); - accu = _mm256_add_epi16(vdst16, accu); - - __m128i lo = _mm256_castsi256_si128(accu); - __m128i hi = _mm256_extracti128_si256(accu, 1); - __m128i filtered = _mm_packus_epi16(lo, hi); - - _mm_store_si128((__m128i*)(dst + (y * width + x)), filtered); - } - } -} - static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w4 function, retrofitted to work with width 16 and up when scale is 0. @@ -2132,7 +2049,7 @@ static void angular_pdpc_ver_w16_scale0_avx2(uvg_pixel* dst, const uvg_pixel* re // Mode 18 -static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +static void angular_pdpc_mode18_w4_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 4; const int limit = MIN(3 << scale, height); @@ -2171,7 +2088,7 @@ static void angular_pdpc_mode18_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel } } -static void angular_pdpc_mode18_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +static void angular_pdpc_mode18_w8_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 8; int limit = MIN(3 << scale, height); @@ -2243,7 +2160,7 @@ static void angular_pdpc_mode18_w16_avx2(uvg_pixel* dst, const uvg_pixel top_lef } } -static void angular_pdpc_mode18_w32_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +static void angular_pdpc_mode18_w32_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 32; int limit = MIN(3 << scale, height); @@ -2290,7 +2207,7 @@ static void angular_pdpc_mode18_w32_improved_avx2(uvg_pixel* dst, const uvg_pixe } } -static void angular_pdpc_mode18_w64_improved_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) +static void angular_pdpc_mode18_w64_avx2(uvg_pixel* dst, const uvg_pixel top_left, const uvg_pixel* ref_side, const int height, const int scale) { const int width = 64; int limit = MIN(3 << scale, height); @@ -2368,7 +2285,7 @@ static void angular_pdpc_mode18_w64_improved_avx2(uvg_pixel* dst, const uvg_pixe // Vertical modes -static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void 
angular_pdpc_ver_w4_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; //ALIGNED(32) uint8_t left[4][4]; @@ -2418,7 +2335,7 @@ static void angular_pdpc_ver_w4_high_angle_improved_avx2(uvg_pixel* dst, const u } } -static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_ver_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; __m128i v32s = _mm_set1_epi16(32); @@ -2459,7 +2376,7 @@ static void angular_pdpc_ver_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } -static void angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_4x4_scale0_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. @@ -2512,7 +2429,7 @@ static void angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(uvg_pixel* dst, } } -static void angular_pdpc_ver_4x4_scale0_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_4x4_scale0_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // This function is just the w4 function, retrofitted to work with any width when scale is 0. If width is 4, use a specialized function instead. // Since scale is 0, limit is 3 and therefore there is no meaningful work to be done when x > 3, so only the first column of 4x4 chunks is handled. @@ -2563,7 +2480,7 @@ static void angular_pdpc_ver_4x4_scale0_improved_avx2(uvg_pixel* dst, const uvg_ } -static void angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_8x4_scale1_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. @@ -2641,7 +2558,7 @@ static void angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(uvg_pixel* dst, } } -static void angular_pdpc_ver_8x4_scale1_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_8x4_scale1_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 1. // Since scale is 1, limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. 
@@ -2702,7 +2619,7 @@ static void angular_pdpc_ver_8x4_scale1_improved_avx2(uvg_pixel* dst, const uvg_ } -static void angular_pdpc_ver_w16_high_angle_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); const int scale = 2; // Other functions handle scales 0 and 1 @@ -2799,7 +2716,7 @@ static void angular_pdpc_ver_w16_high_angle_improved_avx2(uvg_pixel* dst, const } } -static void angular_pdpc_ver_w16_scale2_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +static void angular_pdpc_ver_w16_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m128i v32s = _mm_set1_epi16(32); const int scale = 2; // Other functions handle scales 0 and 1 @@ -2844,7 +2761,7 @@ static void angular_pdpc_ver_w16_scale2_improved_avx2(uvg_pixel* dst, const uvg_ // Horizontal modes -static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_hor_w4_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 4; @@ -2888,7 +2805,7 @@ static void angular_pdpc_hor_w4_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } -static void angular_pdpc_hor_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) +static void angular_pdpc_hor_w8_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int height, const int scale, const int mode_disp) { const int width = 8; @@ -2934,7 +2851,7 @@ static void angular_pdpc_hor_w8_improved_avx2(uvg_pixel* dst, const uvg_pixel* r } } -static void angular_pdpc_hor_w16_improved_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) +static void angular_pdpc_hor_w16_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int scale, const int mode_disp) { int limit = MIN(3 << scale, height); __m128i v32s = _mm_set1_epi16(32); @@ -3661,11 +3578,11 @@ static void uvg_angular_pred_avx2( const uvg_pixel top_left = ref_main[0]; switch (width) { - case 4: angular_pdpc_mode18_w4_improved_avx2(dst, top_left, ref_side, height, scale); break; - case 8: angular_pdpc_mode18_w8_improved_avx2(dst, top_left, ref_side, height, scale); break; + case 4: angular_pdpc_mode18_w4_avx2(dst, top_left, ref_side, height, scale); break; + case 8: angular_pdpc_mode18_w8_avx2(dst, top_left, ref_side, height, scale); break; case 16: angular_pdpc_mode18_w16_avx2(dst, top_left, ref_side, height, scale); break; - case 32: angular_pdpc_mode18_w32_improved_avx2(dst, top_left, ref_side, height, scale); break; - case 64: angular_pdpc_mode18_w64_improved_avx2(dst, top_left, ref_side, height, scale); break; + case 32: angular_pdpc_mode18_w32_avx2(dst, top_left, ref_side, height, scale); break; + case 64: angular_pdpc_mode18_w64_avx2(dst, top_left, ref_side, height, scale); break; default: assert(false && "Intra PDPC, invalid width.\n"); break; @@ -3700,22 +3617,22 @@ static void uvg_angular_pred_avx2( // Low mode disp -> high angle. 
For pdpc, this causes the needed references to be extremely sparse making loads without using gathers impossible. // Handle low angles with more tight reference spacing with separate functions with more optimized loads. if (mode_disp < 6) - angular_pdpc_ver_w4_high_angle_improved_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w4_high_angle_avx2(dst, ref_side, height, scale, mode_disp); else - angular_pdpc_ver_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_ver_w4_avx2(dst, ref_side, height, scale, mode_disp); break; case 8: if (scale == 0) { if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_4x4_scale0_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); } else /*if (scale == 1)*/ { if (mode_disp < 8) - angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_8x4_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_avx2(dst, ref_side, width, height, mode_disp); } // This branch was never executed. There is no case where width == 8 and scale == 2 and PDPC is enabled. /*else { @@ -3731,21 +3648,21 @@ static void uvg_angular_pred_avx2( switch (scale) { case 0: if (mode_disp < 6) - angular_pdpc_ver_4x4_scale0_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_4x4_scale0_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); break; case 1: if (mode_disp < 8) - angular_pdpc_ver_8x4_scale1_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_8x4_scale1_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_8x4_scale1_avx2(dst, ref_side, width, height, mode_disp); break; case 2: if (mode_disp < 14) - angular_pdpc_ver_w16_high_angle_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_w16_high_angle_avx2(dst, ref_side, width, height, mode_disp); else - angular_pdpc_ver_w16_scale2_improved_avx2(dst, ref_side, width, height, mode_disp); + angular_pdpc_ver_w16_scale2_avx2(dst, ref_side, width, height, mode_disp); break; default: assert(false && "Intra PDPC: Invalid scale.\n"); @@ -3764,12 +3681,12 @@ static void uvg_angular_pred_avx2( angular_pdpc_hor_w4_high_angle_improved_avx2(dst, ref_side, height, scale, mode_disp); else*/ // The above code was not accessed ever. There is no case where width == 4 and and mode disp < 6 for horizontal modes where PDPC is enabled. 
- angular_pdpc_hor_w4_improved_avx2(dst, ref_side, height, scale, mode_disp); + angular_pdpc_hor_w4_avx2(dst, ref_side, height, scale, mode_disp); break; - case 8: angular_pdpc_hor_w8_improved_avx2(dst, ref_side, height, scale, mode_disp); break; + case 8: angular_pdpc_hor_w8_avx2(dst, ref_side, height, scale, mode_disp); break; case 16: // 16 width and higher done with the same function case 32: - case 64: angular_pdpc_hor_w16_improved_avx2(dst, ref_side, width, height, scale, mode_disp); break; + case 64: angular_pdpc_hor_w16_avx2(dst, ref_side, width, height, scale, mode_disp); break; default: assert(false && "Intra PDPC: Invalid width.\n"); } @@ -6788,7 +6705,6 @@ int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) if (bitdepth == 8) { success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); - success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); success &= uvg_strategyselector_register(opaque, "mip_predict", "avx2", 40, &mip_predict_avx2); } From 11782722a3baf3a35b895d6380f613fa2d016e33 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 10 Sep 2024 08:49:58 +0300 Subject: [PATCH 233/237] Remove the alt tag from MIP upscale function names. --- src/strategies/avx2/intra-avx2.c | 68 ++++++-------------------------- 1 file changed, 13 insertions(+), 55 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5f4d56aa..5c78352d 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -4963,48 +4963,6 @@ static void mip_upsampling_w8_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixe }*/ } -static void mip_upsampling_w8_ups2_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) -{ - const uint8_t red_pred_size = 4; - const uint8_t ups_factor = 2; // width / red_pred_size - - const int log2_factor = uvg_g_convert_to_log2[ups_factor]; - - const uvg_pixel* ref_ptr = ref + ref_step - 1; - const uvg_pixel* src_ptr = src; - const uvg_pixel* dst_ptr = dst; - - ALIGNED(16) uint8_t before[17]; - memcpy(&before[1], src_ptr, 16); - before[0] = ref_ptr[ref_step * 0]; - before[4] = ref_ptr[ref_step * 1]; - before[8] = ref_ptr[ref_step * 2]; - before[12] = ref_ptr[ref_step * 3]; - - __m128i vbefore = _mm_load_si128((__m128i*)before); - __m128i vbehind = _mm_load_si128((__m128i*)src_ptr); - - __m128i vavg = _mm_avg_epu8(vbefore, vbehind); - - __m128i vreslo = _mm_unpacklo_epi8(vavg, vbehind); - __m128i vreshi = _mm_unpackhi_epi8(vavg, vbehind); - - // Dst step is never 8, since this is only called for 8x8 blocks - *(uint64_t*)&dst[dst_step * 0] = _mm_extract_epi64(vreslo, 0); - *(uint64_t*)&dst[dst_step * 1] = _mm_extract_epi64(vreslo, 1); - *(uint64_t*)&dst[dst_step * 2] = _mm_extract_epi64(vreshi, 0); - *(uint64_t*)&dst[dst_step * 3] = _mm_extract_epi64(vreshi, 1); - - /*if (dst_step == 8) { - _mm256_storeu_si256((__m256i*)dst, vres); - } - else { - *(uint64_t*)&dst[dst_step * 0] = _mm256_extract_epi64(vres, 0); - *(uint64_t*)&dst[dst_step * 1] = _mm256_extract_epi64(vres, 1); - *(uint64_t*)&dst[dst_step * 2] = _mm256_extract_epi64(vres, 2); - *(uint64_t*)&dst[dst_step * 3] = _mm256_extract_epi64(vres, 
3); - }*/ -} static void mip_upsampling_w16_ups2_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { @@ -5117,7 +5075,7 @@ static void mip_upsampling_w16_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst_ptr + dst_step * 3), vtmp3); } -static void mip_upsampling_w32_ups4_hor_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) +static void mip_upsampling_w32_ups4_hor_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref, const uint16_t dst_step, const uint8_t ref_step) { const uint8_t red_pred_size = 8; const uint8_t ups_factor = 4; // width / red_pred_size @@ -5917,7 +5875,7 @@ static void mip_upsampling_w16_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix _mm_store_si128((__m128i*)(dst + 224), vavg7); } -static void mip_upsampling_w16_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w16_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -6039,7 +5997,7 @@ static void mip_upsampling_w32_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } -static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w32_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -6085,7 +6043,7 @@ static void mip_upsampling_w32_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } -static void mip_upsampling_w32_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w32_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -6187,7 +6145,7 @@ static void mip_upsampling_w64_ups2_ver_avx2(uvg_pixel* const dst, const uvg_pix } } -static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w64_ups4_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -6266,7 +6224,7 @@ static void mip_upsampling_w64_ups4_ver_avx2_alt(uvg_pixel* const dst, const uvg } } -static void mip_upsampling_w64_ups8_ver_avx2_alt(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) +static void mip_upsampling_w64_ups8_ver_avx2(uvg_pixel* const dst, const uvg_pixel* const src, const uvg_pixel* const ref) { const uvg_pixel* src_ptr = src; const uvg_pixel* dst_ptr = dst; @@ -6589,7 +6547,7 @@ static void mip_predict_avx2( // Case 4 does not exist. There is no need for horizontal upsampling when width is 4. case 8: // This will only get called for 8x8 blocks. 
- mip_upsampling_w8_ups2_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); + mip_upsampling_w8_ups2_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); break; case 16: if (red_pred_size == 4) { @@ -6604,7 +6562,7 @@ static void mip_predict_avx2( mip_upsampling_w32_ups8_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); } else { - mip_upsampling_w32_ups4_hor_avx2_alt(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. + mip_upsampling_w32_ups4_hor_avx2(hor_dst, reduced_pred, ref_samples_left, ver_src_step, ups_ver_factor); // Works for height 8, 16, 32 and 64. Upsamples 1 to 4. } break; case 64: @@ -6652,7 +6610,7 @@ static void mip_predict_avx2( mip_upsampling_w16_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w16_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w16_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { mip_upsampling_w16_ups8_ver_avx2(result, ver_src, ref_samples_top); @@ -6664,10 +6622,10 @@ static void mip_predict_avx2( mip_upsampling_w32_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w32_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w32_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w32_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w32_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; @@ -6676,10 +6634,10 @@ static void mip_predict_avx2( mip_upsampling_w64_ups2_ver_avx2(result, ver_src, ref_samples_top); } else if (ups_ver_factor == 4) { - mip_upsampling_w64_ups4_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w64_ups4_ver_avx2(result, ver_src, ref_samples_top); } else { - mip_upsampling_w64_ups8_ver_avx2_alt(result, ver_src, ref_samples_top); + mip_upsampling_w64_ups8_ver_avx2(result, ver_src, ref_samples_top); } break; From d7d679079122c33cb6daf14c3a0cfc920a537978 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 10 Sep 2024 09:26:34 +0300 Subject: [PATCH 234/237] static + NO_ASAN --- src/global.h | 6 ++++++ src/strategies/avx2/intra-avx2.c | 1 + src/strategies/avx2/intra_avx2_tables.h | 26 ++++++++++++------------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/global.h b/src/global.h index 972b7e82..f6268f04 100644 --- a/src/global.h +++ b/src/global.h @@ -313,6 +313,12 @@ typedef int32_t mv_t; #define ALIGNED(alignment) __attribute__((aligned (alignment))) #endif +#ifdef _MSC_VER +#define NO_ASAN +#else +#define NO_ASAN __attribute__((no_sanitize("address"))) +#endif + #ifdef _MSC_VER // Buggy VS2010 throws intellisense warnings if void* is not casted. 
#define MALLOC(type, num) (type *)malloc(sizeof(type) * (num)) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5c78352d..09886640 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -410,6 +410,7 @@ static void angular_pred_w8_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, } } +NO_ASAN static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const int16_t* delta_int, const int16_t* delta_fract, const int width, const int height, const int8_t(*filter)[4]) { const __m256i p_shuf_01 = _mm256_setr_epi8( diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index bd63cfed..2b5507d1 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -506,7 +506,7 @@ static ALIGNED(32) const int8_t intra_chroma_linear_interpolation_shuffle_vector }; // Intra interpolation shuffle vectors for luma w4 horizontal. Includes wide angle modes [-12, 1]. Wide angle numbering goes from -12 to 1 since planar and DC (0, 1) are not considered angular modes. -ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { +static ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -792,7 +792,7 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w4_hor[] = { }; // Intra interpolation shuffle vectors for luma w8 horizontal. -ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { +static ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Mode -12 | not used 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -1079,7 +1079,7 @@ ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w8_hor[] = { // Intra interpolation shuffle vectors for luma w64 horizontal. // w16 and w32 functions can also use values in this table. 
-ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w64_hor[] = { +static ALIGNED(32) const int8_t intra_luma_interpolation_shuffle_vectors_w64_hor[] = { 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, 0x05, 0x06, 0x07, 0x08, // Mode 5 mrl 0 0x06, 0x07, 0x08, 0x09, 0x07, 0x08, 0x09, 0x0a, 0x07, 0x08, 0x09, 0x0a, 0x08, 0x09, 0x0a, 0x0b, 0x09, 0x0a, 0x0b, 0x0c, 0x0a, 0x0b, 0x0c, 0x0d, 0x0a, 0x0b, 0x0c, 0x0d, 0x0b, 0x0c, 0x0d, 0x0e, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07, 0x05, 0x06, 0x07, 0x08, @@ -2611,21 +2611,21 @@ static ALIGNED(32) const int16_t intra_pdpc_w16_ver_weight[] = { // Weights for improved PDPC // Weights for improved intra pdpc w4 vertical. -ALIGNED(32) const uint8_t intra_pdpc_w4_ver_improved_weight[] = { +static ALIGNED(32) const uint8_t intra_pdpc_w4_ver_improved_weight[] = { 32, 32, 56, 8, 62, 2, 64, 0, 32, 32, 56, 8, 62, 2, 64, 0, // Scale 0 32, 32, 48, 16, 56, 8, 60, 4, 32, 32, 48, 16, 56, 8, 60, 4, // Scale 1 32, 32, 32, 32, 48, 16, 48, 16, 32, 32, 32, 32, 48, 16, 48, 16, // Scale 2 }; // Weights for improved intra pdpc w8 vertical. -ALIGNED(32) const uint8_t intra_pdpc_w8_ver_improved_weight[] = { +static ALIGNED(32) const uint8_t intra_pdpc_w8_ver_improved_weight[] = { 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, // Scale 1 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, // Scale 2 }; // Weights for improved intra pdpc w16 vertical. -ALIGNED(32) const uint8_t intra_pdpc_w16_ver_improved_weight[] = { +static ALIGNED(32) const uint8_t intra_pdpc_w16_ver_improved_weight[] = { 32, 32, 56, 8, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 1 32, 32, 32, 32, 48, 16, 48, 16, 56, 8, 56, 8, 60, 4, 60, 4, 62, 2, 62, 2, 63, 1, 63, 1, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 2 @@ -2633,7 +2633,7 @@ ALIGNED(32) const uint8_t intra_pdpc_w16_ver_improved_weight[] = { // Weights for improved intra pdpc w4 horizontal. -ALIGNED(32) const uint8_t intra_pdpc_w4_hor_improved_weight[] = { +static ALIGNED(32) const uint8_t intra_pdpc_w4_hor_improved_weight[] = { 32, 32, 32, 32, 32, 32, 32, 32, 56, 8, 56, 8, 56, 8, 56, 8, 62, 2, 62, 2, 62, 2, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, // Scale 0 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, @@ -2649,7 +2649,7 @@ ALIGNED(32) const uint8_t intra_pdpc_w4_hor_improved_weight[] = { }; // Weights for improved intra pdpc w8 horizontal. 
-ALIGNED(32) const uint8_t intra_pdpc_w8_hor_improved_weight[] = { +static ALIGNED(32) const uint8_t intra_pdpc_w8_hor_improved_weight[] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, 56, 8, // Scale 0 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 62, 2, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, @@ -3417,7 +3417,7 @@ static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { // Intra ref building shuffle vector tables -ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_4[] = { +static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_4[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 1 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 2 @@ -3437,7 +3437,7 @@ ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_4[] = { 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 16 }; -ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_8[] = { +static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_8[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 1 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 2 @@ -3457,7 +3457,7 @@ ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_8[] = { 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 16 }; -ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_16[] = { +static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_16[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // mode disp 1 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // mode disp 2 @@ -3702,7 +3702,7 @@ ALIGNED(32) static const int16_t delta_fract_wide_angle_table[1200] = { 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 1, 4, 7, 10, 13, 16, // 1 }; -ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_32[] = { +static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_32[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // mode disp 1 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, // mode disp 2 @@ -3722,7 +3722,7 @@ ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_32[] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, // mode disp 16 }; -ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_64[] = { +static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_64[] = { //<-v0----------------------------------------------------------------------------------------->||<-v1----------------------------------------------------------------------------------------->||<-v2----------------------------------------------------------------------------------------->||<-v3-----------------------------------------------------------------------------------------> 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mode disp 0 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, // mode disp 1 From ddc8b2956bccca69bb5877b6e48a1f30358ea6c3 Mon Sep 17 00:00:00 2001 From: siivonek Date: Tue, 10 Sep 2024 09:44:55 +0300 Subject: [PATCH 235/237] Change pdpc shuffle tables to uint8. Properly mark unused rows. Reduce hex field size to 8 bits, larger field size will cause problems in linux. During compilation in linux, using 0xfff with 8 bit tables causes an error. --- src/strategies/avx2/intra_avx2_tables.h | 268 ++++++++++++------------ 1 file changed, 133 insertions(+), 135 deletions(-) diff --git a/src/strategies/avx2/intra_avx2_tables.h b/src/strategies/avx2/intra_avx2_tables.h index 2b5507d1..df0ed5a3 100644 --- a/src/strategies/avx2/intra_avx2_tables.h +++ b/src/strategies/avx2/intra_avx2_tables.h @@ -3267,154 +3267,152 @@ static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_hor[] = { // Shuffle vectors for w4 vertical pdpc. 
-static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w4_ver[] = { - 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 0 - 0x000, 0x020, 0x040, 0x060, 0x001, 0x021, 0x041, 0x061, 0x002, 0x022, 0x042, 0x062, 0x003, 0x023, 0x043, 0x063, // Mode disp 1 - 0x000, 0x010, 0x020, 0x030, 0x001, 0x011, 0x021, 0x031, 0x002, 0x012, 0x022, 0x032, 0x003, 0x013, 0x023, 0x033, // Mode disp 2 - 0x000, 0x00a, 0x015, 0x020, 0x001, 0x00b, 0x016, 0x021, 0x002, 0x00c, 0x017, 0x022, 0x003, 0x00d, 0x018, 0x023, // Mode disp 3 - 0x000, 0x008, 0x010, 0x018, 0x001, 0x009, 0x011, 0x019, 0x002, 0x00a, 0x012, 0x01a, 0x003, 0x00b, 0x013, 0x01b, // Mode disp 4 - 0x000, 0x006, 0x00b, 0x010, 0x001, 0x007, 0x00c, 0x011, 0x002, 0x008, 0x00d, 0x012, 0x003, 0x009, 0x00e, 0x013, // Mode disp 5 - 0x000, 0x004, 0x008, 0x00c, 0x001, 0x005, 0x009, 0x00d, 0x002, 0x006, 0x00a, 0x00e, 0x003, 0x007, 0x00b, 0x00f, // Mode disp 6 - 0x000, 0x003, 0x007, 0x00a, 0x001, 0x004, 0x008, 0x00b, 0x002, 0x005, 0x009, 0x00c, 0x003, 0x006, 0x00a, 0x00d, // Mode disp 7 - 0x000, 0x002, 0x005, 0x008, 0x001, 0x003, 0x006, 0x009, 0x002, 0x004, 0x007, 0x00a, 0x003, 0x005, 0x008, 0x00b, // Mode disp 8 - 0x000, 0x003, 0x005, 0x007, 0x001, 0x004, 0x006, 0x008, 0x002, 0x005, 0x007, 0x009, 0x003, 0x006, 0x008, 0x00a, // Mode disp 9 - 0x000, 0x002, 0x004, 0x006, 0x001, 0x003, 0x005, 0x007, 0x002, 0x004, 0x006, 0x008, 0x003, 0x005, 0x007, 0x009, // Mode disp 10 - 0x000, 0x002, 0x003, 0x005, 0x001, 0x003, 0x004, 0x006, 0x002, 0x004, 0x005, 0x007, 0x003, 0x005, 0x006, 0x008, // Mode disp 11 - 0x000, 0x001, 0x003, 0x004, 0x001, 0x002, 0x004, 0x005, 0x002, 0x003, 0x005, 0x006, 0x003, 0x004, 0x006, 0x007, // Mode disp 12 - 0x000, 0x002, 0x003, 0x005, 0x001, 0x003, 0x004, 0x006, 0x002, 0x004, 0x005, 0x007, 0x003, 0x005, 0x006, 0x008, // Mode disp 13 - 0x000, 0x001, 0x003, 0x004, 0x001, 0x002, 0x004, 0x005, 0x002, 0x003, 0x005, 0x006, 0x003, 0x004, 0x006, 0x007, // Mode disp 14 - 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 15 - 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 16 - 0x000, 0x001, 0x002, 0x003, 0x001, 0x002, 0x003, 0x004, 0x002, 0x003, 0x004, 0x005, 0x003, 0x004, 0x005, 0x006, // Mode disp 17 - 0x000, 0x001, 0x001, 0x002, 0x001, 0x002, 0x002, 0x003, 0x002, 0x003, 0x003, 0x004, 0x003, 0x004, 0x004, 0x005, // Mode disp 18 - 0x000, 0x000, 0x001, 0x002, 0x001, 0x001, 0x002, 0x003, 0x002, 0x002, 0x003, 0x004, 0x003, 0x003, 0x004, 0x005, // Mode disp 19 - 0x000, 0x000, 0x001, 0x002, 0x001, 0x001, 0x002, 0x003, 0x002, 0x002, 0x003, 0x004, 0x003, 0x003, 0x004, 0x005, // Mode disp 20 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 21 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 22 - 0x000, 0x001, 0x001, 0x002, 0x001, 0x002, 0x002, 0x003, 0x002, 0x003, 0x003, 0x004, 0x003, 0x004, 0x004, 0x005, // Mode disp 23 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 24 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 25 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 
0x004, // Mode disp 26 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, // Mode disp 27 - 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, // Mode disp 28 - 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 29 - 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 30 - 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, // Mode disp 31 +static ALIGNED(32) const uint8_t intra_pdpc_shuffle_vectors_w4_ver[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 0 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 1 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 2 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 3 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 4 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 5 | not used + 0x00, 0x04, 0x08, 0x0c, 0x01, 0x05, 0x09, 0x0d, 0x02, 0x06, 0x0a, 0x0e, 0x03, 0x07, 0x0b, 0x0f, // Mode disp 6 + 0x00, 0x03, 0x07, 0x0a, 0x01, 0x04, 0x08, 0x0b, 0x02, 0x05, 0x09, 0x0c, 0x03, 0x06, 0x0a, 0x0d, // Mode disp 7 + 0x00, 0x02, 0x05, 0x08, 0x01, 0x03, 0x06, 0x09, 0x02, 0x04, 0x07, 0x0a, 0x03, 0x05, 0x08, 0x0b, // Mode disp 8 + 0x00, 0x03, 0x05, 0x07, 0x01, 0x04, 0x06, 0x08, 0x02, 0x05, 0x07, 0x09, 0x03, 0x06, 0x08, 0x0a, // Mode disp 9 + 0x00, 0x02, 0x04, 0x06, 0x01, 0x03, 0x05, 0x07, 0x02, 0x04, 0x06, 0x08, 0x03, 0x05, 0x07, 0x09, // Mode disp 10 + 0x00, 0x02, 0x03, 0x05, 0x01, 0x03, 0x04, 0x06, 0x02, 0x04, 0x05, 0x07, 0x03, 0x05, 0x06, 0x08, // Mode disp 11 + 0x00, 0x01, 0x03, 0x04, 0x01, 0x02, 0x04, 0x05, 0x02, 0x03, 0x05, 0x06, 0x03, 0x04, 0x06, 0x07, // Mode disp 12 + 0x00, 0x02, 0x03, 0x05, 0x01, 0x03, 0x04, 0x06, 0x02, 0x04, 0x05, 0x07, 0x03, 0x05, 0x06, 0x08, // Mode disp 13 + 0x00, 0x01, 0x03, 0x04, 0x01, 0x02, 0x04, 0x05, 0x02, 0x03, 0x05, 0x06, 0x03, 0x04, 0x06, 0x07, // Mode disp 14 + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // Mode disp 15 + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // Mode disp 16 + 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, // Mode disp 17 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode disp 18 + 0x00, 0x00, 0x01, 0x02, 0x01, 0x01, 0x02, 0x03, 0x02, 0x02, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05, // Mode disp 19 + 0x00, 0x00, 0x01, 0x02, 0x01, 0x01, 0x02, 0x03, 0x02, 0x02, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05, // Mode disp 20 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, // Mode disp 21 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, // Mode disp 22 + 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, // Mode disp 23 + 0x00, 0x01, 
0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, // Mode disp 24 + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, // Mode disp 25 + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, // Mode disp 26 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, // Mode disp 27 + 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, // Mode disp 28 + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, // Mode disp 29 + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, // Mode disp 30 + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, // Mode disp 31 }; -// Shuffle vectors for 8x2 scale 1 vertical pdpc. 0xfff entries are "don't care", those will be zeroed out by zero weights -// These are basically same as the 8x2 scale2 vectors, but with added "don't care" entries. This table can be safely removed. -static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 0 - 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0xfff, 0xfff, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0xfff, 0xfff, // Mode disp 1 - 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0xfff, 0xfff, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0xfff, 0xfff, // Mode disp 2 - 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0xfff, 0xfff, 0x001, 0x00b, 0x016, 0x021, 0x02b, 0x036, 0xfff, 0xfff, // Mode disp 3 - 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0xfff, 0xfff, 0x001, 0x009, 0x011, 0x019, 0x021, 0x029, 0xfff, 0xfff, // Mode disp 4 - 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0xfff, 0xfff, 0x001, 0x007, 0x00c, 0x011, 0x017, 0x01c, 0xfff, 0xfff, // Mode disp 5 - 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0xfff, 0xfff, 0x001, 0x005, 0x009, 0x00d, 0x011, 0x015, 0xfff, 0xfff, // Mode disp 6 - 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0xfff, 0xfff, 0x001, 0x004, 0x008, 0x00b, 0x00e, 0x011, 0xfff, 0xfff, // Mode disp 7 - 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0xfff, 0xfff, 0x001, 0x003, 0x006, 0x009, 0x00b, 0x00e, 0xfff, 0xfff, // Mode disp 8 - 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0xfff, 0xfff, 0x001, 0x004, 0x006, 0x008, 0x00a, 0x00d, 0xfff, 0xfff, // Mode disp 9 - 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0xfff, 0xfff, 0x001, 0x003, 0x005, 0x007, 0x009, 0x00b, 0xfff, 0xfff, // Mode disp 10 - 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0xfff, 0xfff, 0x001, 0x003, 0x004, 0x006, 0x008, 0x00a, 0xfff, 0xfff, // Mode disp 11 - 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0xfff, 0xfff, 0x001, 0x002, 0x004, 0x005, 0x007, 0x009, 0xfff, 0xfff, // Mode disp 12 - 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0xfff, 0xfff, 0x001, 0x003, 0x004, 0x006, 0x007, 0x008, 0xfff, 0xfff, // Mode disp 13 - 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0xfff, 0xfff, 0x001, 0x002, 0x004, 0x005, 0x006, 0x007, 0xfff, 0xfff, // Mode disp 14 - 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x006, 0x007, 0xfff, 0xfff, // Mode disp 15 - 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0xfff, 0xfff, // Mode disp 16 - 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0xfff, 0xfff, 0x001, 0x002, 0x003, 0x004, 0x005, 0x005, 0xfff, 0xfff, // Mode 
disp 17 - 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x003, 0x004, 0x005, 0xfff, 0xfff, // Mode disp 18 - 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x003, 0x004, 0x004, 0xfff, 0xfff, // Mode disp 19 - 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x003, 0x003, 0x004, 0xfff, 0xfff, // Mode disp 20 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 21 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 22 - 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0xfff, 0xfff, // Mode disp 23 - 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 24 - 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0xfff, 0xfff, // Mode disp 25 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0xfff, 0xfff, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0xfff, 0xfff, // Mode disp 26 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0xfff, 0xfff, // Mode disp 27 - 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0xfff, 0xfff, // Mode disp 28 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0xfff, 0xfff, // Mode disp 29 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 30 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0xfff, 0xfff, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0xfff, 0xfff, // Mode disp 31 +// Shuffle vectors for 8x2 scale 1 vertical pdpc. 
+static ALIGNED(32) const uint8_t intra_pdpc_shuffle_vectors_8x2_scale1_ver[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 0 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 1 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 2 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 3 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 4 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 5 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 6 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 7 | not used + 0x00, 0x02, 0x05, 0x08, 0x0a, 0x0d, 0xff, 0xff, 0x01, 0x03, 0x06, 0x09, 0x0b, 0x0e, 0xff, 0xff, // Mode disp 8 + 0x00, 0x03, 0x05, 0x07, 0x09, 0x0c, 0xff, 0xff, 0x01, 0x04, 0x06, 0x08, 0x0a, 0x0d, 0xff, 0xff, // Mode disp 9 + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0xff, 0xff, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0xff, 0xff, // Mode disp 10 + 0x00, 0x02, 0x03, 0x05, 0x07, 0x09, 0xff, 0xff, 0x01, 0x03, 0x04, 0x06, 0x08, 0x0a, 0xff, 0xff, // Mode disp 11 + 0x00, 0x01, 0x03, 0x04, 0x06, 0x08, 0xff, 0xff, 0x01, 0x02, 0x04, 0x05, 0x07, 0x09, 0xff, 0xff, // Mode disp 12 + 0x00, 0x02, 0x03, 0x05, 0x06, 0x07, 0xff, 0xff, 0x01, 0x03, 0x04, 0x06, 0x07, 0x08, 0xff, 0xff, // Mode disp 13 + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0xff, 0xff, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0xff, 0xff, // Mode disp 14 + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0xff, 0xff, // Mode disp 15 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0xff, 0xff, // Mode disp 16 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0xff, 0xff, // Mode disp 17 + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0xff, 0xff, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0xff, 0xff, // Mode disp 18 + 0x00, 0x00, 0x01, 0x02, 0x03, 0x03, 0xff, 0xff, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0xff, 0xff, // Mode disp 19 + 0x00, 0x00, 0x01, 0x02, 0x02, 0x03, 0xff, 0xff, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0xff, 0xff, // Mode disp 20 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0xff, 0xff, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0xff, 0xff, // Mode disp 21 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0xff, 0xff, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0xff, 0xff, // Mode disp 22 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0xff, 0xff, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0xff, 0xff, // Mode disp 23 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0xff, 0xff, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0xff, 0xff, // Mode disp 24 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0xff, 0xff, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0xff, 0xff, // Mode disp 25 + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0xff, 0xff, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0xff, 0xff, // Mode disp 26 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0xff, 0xff, // Mode disp 27 + 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0xff, 0xff, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0xff, 0xff, // Mode disp 28 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xff, 0xff, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 
0xff, 0xff, // Mode disp 29 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, // Mode disp 30 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, // Mode disp 31 }; // Shuffle vectors for 8x2 scale 2 vertical pdpc. -static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 0 -- Unused - 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x001, 0x021, 0x041, 0x061, 0x081, 0x0a1, 0x0c1, 0x0e1, // Mode disp 1 * - 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x001, 0x011, 0x021, 0x031, 0x041, 0x051, 0x061, 0x071, // Mode disp 2 * - 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0x040, 0x04a, 0x001, 0x00b, 0x016, 0x021, 0x02b, 0x036, 0x041, 0x04b, // Mode disp 3 * - 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0x030, 0x038, 0x001, 0x009, 0x011, 0x019, 0x021, 0x029, 0x031, 0x039, // Mode disp 4 * - 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0x020, 0x026, 0x001, 0x007, 0x00c, 0x011, 0x017, 0x01c, 0x021, 0x027, // Mode disp 5 * - 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0x018, 0x01c, 0x001, 0x005, 0x009, 0x00d, 0x011, 0x015, 0x019, 0x01d, // Mode disp 6 * - 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0x013, 0x017, 0x001, 0x004, 0x008, 0x00b, 0x00e, 0x011, 0x014, 0x018, // Mode disp 7 * - 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0x010, 0x012, 0x001, 0x003, 0x006, 0x009, 0x00b, 0x00e, 0x011, 0x013, // Mode disp 8 * - 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0x00e, 0x010, 0x001, 0x004, 0x006, 0x008, 0x00a, 0x00d, 0x00f, 0x011, // Mode disp 9 -- Unused - 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0x00c, 0x00e, 0x001, 0x003, 0x005, 0x007, 0x009, 0x00b, 0x00d, 0x00f, // Mode disp 10 - 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0x00a, 0x00c, 0x001, 0x003, 0x004, 0x006, 0x008, 0x00a, 0x00b, 0x00d, // Mode disp 11 - 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0x009, 0x00b, 0x001, 0x002, 0x004, 0x005, 0x007, 0x009, 0x00a, 0x00c, // Mode disp 12 - 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0x009, 0x00a, 0x001, 0x003, 0x004, 0x006, 0x007, 0x008, 0x00a, 0x00b, // Mode disp 13 - 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0x008, 0x009, 0x001, 0x002, 0x004, 0x005, 0x006, 0x007, 0x009, 0x00a, // Mode disp 14 - 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x001, 0x002, 0x003, 0x004, 0x006, 0x007, 0x008, 0x009, // Mode disp 15 - 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, // Mode disp 16 - 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, 0x001, 0x002, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, // Mode disp 17 - 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x001, 0x002, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, // Mode disp 18 - 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x001, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, // Mode disp 19 - 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x001, 0x001, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, // Mode disp 20 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 21 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 22 - 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, // Mode disp 23 - 0x000, 0x001, 
0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, // Mode disp 24 - 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x003, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x004, // Mode disp 25 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, // Mode disp 26 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, 0x003, // Mode disp 27 - 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, // Mode disp 28 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, // Mode disp 29 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, // Mode disp 30 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 31 +static ALIGNED(32) const uint8_t intra_pdpc_shuffle_vectors_8x2_scale2_ver[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 0 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 1 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 2 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 3 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 4 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 5 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 6 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 7 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 8 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 9 | not used + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f, // Mode disp 10 + 0x00, 0x02, 0x03, 0x05, 0x07, 0x09, 0x0a, 0x0c, 0x01, 0x03, 0x04, 0x06, 0x08, 0x0a, 0x0b, 0x0d, // Mode disp 11 + 0x00, 0x01, 0x03, 0x04, 0x06, 0x08, 0x09, 0x0b, 0x01, 0x02, 0x04, 0x05, 0x07, 0x09, 0x0a, 0x0c, // Mode disp 12 + 0x00, 0x02, 0x03, 0x05, 0x06, 0x07, 0x09, 0x0a, 0x01, 0x03, 0x04, 0x06, 0x07, 0x08, 0x0a, 0x0b, // Mode disp 13 + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0a, // Mode disp 14 + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x06, 0x07, 0x08, 0x09, // Mode disp 15 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, // Mode disp 16 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, // Mode disp 17 + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x01, 0x02, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // Mode disp 18 + 0x00, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x01, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, // Mode disp 19 + 0x00, 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x01, 0x02, 
0x03, 0x03, 0x04, 0x04, 0x05, // Mode disp 20 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, // Mode disp 21 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, // Mode disp 22 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, // Mode disp 23 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, // Mode disp 24 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x04, // Mode disp 25 + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, // Mode disp 26 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, // Mode disp 27 + 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, // Mode disp 28 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, // Mode disp 29 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, // Mode disp 30 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // Mode disp 31 }; // Shuffle vectors for w16 scale 2 vertical pdpc. -static ALIGNED(32) const int8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // Mode disp 0 -- Unused - 0x000, 0x020, 0x040, 0x060, 0x080, 0x0a0, 0x0c0, 0x0e0, 0x100, 0x120, 0x140, 0x160, 0x180, 0x1a0, 0x1c0, 0x1e0, // Mode disp 1 * - 0x000, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080, 0x090, 0x0a0, 0x0b0, 0x0c0, 0x0d0, 0x0e0, 0x0f0, // Mode disp 2 * - 0x000, 0x00a, 0x015, 0x020, 0x02a, 0x035, 0x040, 0x04a, 0x055, 0x060, 0x06a, 0x075, 0x080, 0x08a, 0x095, 0x0a0, // Mode disp 3 * - 0x000, 0x008, 0x010, 0x018, 0x020, 0x028, 0x030, 0x038, 0x040, 0x048, 0x050, 0x058, 0x060, 0x068, 0x070, 0x078, // Mode disp 4 * - 0x000, 0x006, 0x00b, 0x010, 0x016, 0x01b, 0x020, 0x026, 0x02b, 0x030, 0x036, 0x03b, 0x040, 0x046, 0x04b, 0x050, // Mode disp 5 * - 0x000, 0x004, 0x008, 0x00c, 0x010, 0x014, 0x018, 0x01c, 0x020, 0x024, 0x028, 0x02c, 0x030, 0x034, 0x038, 0x03c, // Mode disp 6 * - 0x000, 0x003, 0x007, 0x00a, 0x00d, 0x010, 0x013, 0x017, 0x01a, 0x01d, 0x020, 0x023, 0x027, 0x02a, 0x02d, 0x030, // Mode disp 7 * - 0x000, 0x002, 0x005, 0x008, 0x00a, 0x00d, 0x010, 0x012, 0x015, 0x018, 0x01a, 0x01d, 0x020, 0x022, 0x025, 0x028, // Mode disp 8 * - 0x000, 0x003, 0x005, 0x007, 0x009, 0x00c, 0x00e, 0x010, 0x013, 0x015, 0x017, 0x019, 0x01c, 0x01e, 0x020, 0x023, // Mode disp 9 * - 0x000, 0x002, 0x004, 0x006, 0x008, 0x00a, 0x00c, 0x00e, 0x010, 0x012, 0x014, 0x016, 0x018, 0x01a, 0x01c, 0x01e, // Mode disp 10 * - 0x000, 0x002, 0x003, 0x005, 0x007, 0x009, 0x00a, 0x00c, 0x00e, 0x010, 0x012, 0x013, 0x015, 0x017, 0x019, 0x01a, // Mode disp 11 * - 0x000, 0x001, 0x003, 0x004, 0x006, 0x008, 0x009, 0x00b, 0x00c, 0x00e, 0x010, 0x011, 0x013, 0x014, 0x016, 0x018, // Mode disp 12 * - 0x000, 0x002, 0x003, 0x005, 0x006, 0x007, 0x009, 0x00a, 0x00c, 0x00d, 0x00e, 0x010, 0x011, 0x012, 0x014, 0x015, // Mode disp 13 * - 0x000, 0x001, 0x003, 0x004, 0x005, 0x006, 0x008, 0x009, 0x00a, 0x00b, 0x00d, 0x00e, 0x00f, 0x010, 0x011, 0x013, // Mode disp 14 * - 0x000, 0x001, 0x002, 0x003, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x010, 0x011, // Mode disp 15 -- Unused - 
0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, // Mode disp 16 - 0x000, 0x001, 0x002, 0x003, 0x004, 0x004, 0x005, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, // Mode disp 17 - 0x000, 0x001, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x006, 0x007, 0x008, 0x009, 0x00a, 0x00a, 0x00b, 0x00c, // Mode disp 18 - 0x000, 0x000, 0x001, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x008, 0x008, 0x009, 0x00a, 0x00a, // Mode disp 19 - 0x000, 0x000, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x007, 0x007, 0x008, 0x008, 0x009, // Mode disp 20 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, 0x008, // Mode disp 21 - 0x000, 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, // Mode disp 22 - 0x000, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, 0x007, 0x007, // Mode disp 23 - 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, 0x006, 0x006, // Mode disp 24 - 0x000, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, 0x005, 0x005, // Mode disp 25 - 0x000, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, 0x003, 0x004, 0x004, 0x004, // Mode disp 26 - 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, 0x002, 0x003, 0x003, 0x003, // Mode disp 27 - 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, 0x002, 0x002, 0x002, 0x002, // Mode disp 28 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x002, // Mode disp 29 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, 0x001, // Mode disp 30 - 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x001, // Mode disp 31 +static ALIGNED(32) const uint8_t intra_pdpc_shuffle_vectors_w16_scale2_ver[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 0 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 1 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 2 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 3 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 4 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 5 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 6 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 7 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 8 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 9 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 10 | not used + 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 11 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 12 | not used + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // Mode disp 13 | not used + 0x00, 0x01, 0x03, 0x04, 0x05, 0x06, 0x08, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0xff, 0xff, 0xff, 0xff, // Mode disp 14 + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0xff, 0xff, 0xff, 0xff, // Mode disp 15 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, // Mode disp 16 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0xff, 0xff, 0xff, 0xff, // Mode disp 17 + 0x00, 0x01, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, // Mode disp 18 + 0x00, 0x00, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0x08, 0xff, 0xff, 0xff, 0xff, // Mode disp 19 + 0x00, 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x05, 0x05, 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, // Mode disp 20 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x05, 0x05, 0x06, 0xff, 0xff, 0xff, 0xff, // Mode disp 21 + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0xff, 0xff, 0xff, 0xff, // Mode disp 22 + 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x04, 0x05, 0x05, 0xff, 0xff, 0xff, 0xff, // Mode disp 23 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0xff, 0xff, 0xff, 0xff, // Mode disp 24 + 0x00, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0xff, 0xff, 0xff, 0xff, // Mode disp 25 + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0xff, 0xff, 0xff, 0xff, // Mode disp 26 + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0xff, 0xff, 0xff, 0xff, // Mode disp 27 + 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0xff, 0xff, 0xff, 0xff, // Mode disp 28 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, 0xff, // Mode disp 29 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, 0xff, // Mode disp 30 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, // Mode disp 31 }; - // Intra ref building shuffle vector tables static ALIGNED(16) const uint8_t intra_refbuild_shuffle_vectors_sidesize_4[] = { From ee0bfedd578ff21db315e450c5a33b66475e2189 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 10 Sep 2024 10:49:14 +0300 Subject: [PATCH 236/237] [avx2] change unaligned reads to unaligned --- src/strategies/avx2/intra-avx2.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 09886640..59b27be4 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -446,6 +446,8 @@ static void angular_pred_w16_ver_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, for (int y = 0; y < height; ++y) { // Load and shuffle filter weights + // This load can read beyond the end of the filter table, however the values + // are not used in the shuffle operation. 
__m128i vweights = _mm_loadu_si128((__m128i*)&filter[delta_fract[y]]); __m256i vw256 = _mm256_inserti128_si256(_mm256_castsi128_si256(vweights), vweights, 1); @@ -578,7 +580,7 @@ static void angular_pred_w4_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, int ref_offset = MIN(delta_int[0], delta_int[3]); // Copy the filter to local memory - __m128i vdfract = _mm_load_si128((__m128i*)delta_fract); + __m128i vdfract = _mm_loadu_si128((__m128i*)delta_fract); __m128i vidx = _mm_cvtepi16_epi32(vdfract); __m128i all_weights = _mm_i32gather_epi32((const int32_t*)filter, vidx, 4); @@ -670,7 +672,7 @@ static void angular_pred_w8_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, const __m256i v32s = _mm256_set1_epi16(32); // Load weights - __m128i tmp = _mm_load_si128((__m128i*)delta_fract); + __m128i tmp = _mm_loadu_si128((__m128i*)delta_fract); __m256i vidxw = _mm256_cvtepi16_epi32(tmp); __m256i vweights = _mm256_i32gather_epi32((const int32_t*)filter, vidxw, 4); @@ -728,8 +730,8 @@ static void angular_pred_w16_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel } for (int x = 0, vi = 0; x < width; x += 16, ++vi) { - __m128i tmp0 = _mm_load_si128((__m128i*)&delta_int[x]); - __m128i tmp1 = _mm_load_si128((__m128i*)&delta_int[x + 8]); + __m128i tmp0 = _mm_loadu_si128((__m128i*)&delta_int[x]); + __m128i tmp1 = _mm_loadu_si128((__m128i*)&delta_int[x + 8]); __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); @@ -814,8 +816,8 @@ static void angular_pred_w32_hor_avx2(uvg_pixel* dst, const uvg_pixel* ref_main, for (int x = 0, shuf = table_offset; x < width; x += 16, shuf += 64) { const int ref_offset = MIN(delta_int[x], delta_int[x + 15]); - __m128i tmp0 = _mm_load_si128((__m128i*)&delta_fract[x]); - __m128i tmp1 = _mm_load_si128((__m128i*)&delta_fract[x + 8]); + __m128i tmp0 = _mm_loadu_si128((__m128i*)&delta_fract[x]); + __m128i tmp1 = _mm_loadu_si128((__m128i*)&delta_fract[x + 8]); __m256i vidx0 = _mm256_cvtepi16_epi32(tmp0); __m256i vidx1 = _mm256_cvtepi16_epi32(tmp1); From ae646ec3556b20f80f9afa2793a9dc3aba8e42fe Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 16 Sep 2024 09:53:03 +0300 Subject: [PATCH 237/237] [intra] unaligned --- src/strategies/avx2/intra-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 59b27be4..24acca9e 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -525,7 +525,7 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* for (int y = 0; y < height; y += 4) { // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. - __m128i vidx = _mm_load_si128((__m128i*)delta_int); + __m128i vidx = _mm_loadu_si128((__m128i*)delta_int); __m256i vidx256 = _mm256_cvtepu16_epi64(vidx); __m256i vp = _mm256_i64gather_epi64((const long long int*)&ref_main[y], vidx256, 1);
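A minimal illustrative sketch (not part of the patches) of the property PATCH 235 leans on when it fills unused shuffle-table rows and tail lanes with 0xff: assuming the 16-byte rows feed _mm_shuffle_epi8, as their layout suggests, any control byte with its high bit set (such as 0xff) yields a zero output byte, so the marked lanes are harmless even before the zero PDPC weights are applied; and unlike 0xfff, the value fits in a uint8_t element without the narrowing error the commit message mentions. The sample data and names below are invented for the demonstration.

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
  const uint8_t src[16]  = { 10, 11, 12, 13, 14, 15, 16, 17,
                             18, 19, 20, 21, 22, 23, 24, 25 };
  /* Pick reference samples 0..5 twice; mark the remaining lanes unused with 0xff. */
  const uint8_t ctrl[16] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0xff, 0xff,
                             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0xff, 0xff };

  __m128i vsrc  = _mm_loadu_si128((const __m128i*)src);
  __m128i vctrl = _mm_loadu_si128((const __m128i*)ctrl);
  __m128i vres  = _mm_shuffle_epi8(vsrc, vctrl);  /* 0xff lanes come out as 0 */

  uint8_t res[16];
  _mm_storeu_si128((__m128i*)res, vres);
  for (int i = 0; i < 16; ++i) printf("%2d ", res[i]);  /* 10..15 0 0 10..15 0 0 */
  printf("\n");
  return 0;
}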
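For reference, a hedged sketch (not taken from the patches) of why PATCH 236 and 237 replace _mm_load_si128 with _mm_loadu_si128: the delta_int/delta_fract buffers are read at offsets such as &delta_int[x] or &delta_fract[x + 8], which are only guaranteed 2-byte alignment, and an aligned 128-bit load from a misaligned address can fault. The buffer declaration here is a stand-in, not the encoder's actual one.

#include <stdint.h>
#include <emmintrin.h>   /* SSE2: _mm_loadu_si128 */

/* Hypothetical stand-in for the per-mode delta tables. */
static int16_t delta_int[80];

__m128i load_eight_deltas(int x)
{
  /* For x values that are not multiples of 8, &delta_int[x] is not 16-byte
   * aligned, so _mm_load_si128 could raise a general-protection fault here.
   * _mm_loadu_si128 has no alignment requirement and, on recent x86 cores,
   * is typically just as fast when the data happens to be aligned. */
  return _mm_loadu_si128((const __m128i*)&delta_int[x]);
}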
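The comment added in PATCH 236 ("This load can read beyond the end of the filter table, however the values are not used in the shuffle operation") describes a common SIMD trade-off: a full 16-byte load is issued even though only the first few bytes are needed, and the following shuffle never selects the trailing bytes. A sketch of the idea with an illustrative filter layout (the real table's name, dimensions, and coefficients differ), padded here so the over-read stays inside the object:

#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* Illustrative 4-tap filter: 32 fractional positions, 4 coefficients each,
 * padded by 12 bytes so a 16-byte load from the last row stays in bounds. */
static const int8_t filter[32 + 3][4] = { { 16, 32, 16, 0 } /* rest omitted */ };

__m128i load_filter_row(int frac)
{
  /* Loads 16 bytes starting at filter[frac]; for frac near 31, most of those
   * bytes belong to the padding. The shuffle below only ever selects bytes
   * 0..3, so the extra bytes never affect the result. */
  __m128i v = _mm_loadu_si128((const __m128i*)&filter[frac]);
  const __m128i broadcast_row = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3);
  return _mm_shuffle_epi8(v, broadcast_row);
}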