From a30b3d28dc069610a4dd9bcf752d1459b3006635 Mon Sep 17 00:00:00 2001 From: DRC Date: Fri, 26 Sep 2025 10:09:01 -0400 Subject: [PATCH] MMI: Format comments consistently with MMX code --- simd/mips64/jccolext-mmi.c | 4 +- simd/mips64/jccolor-mmi.c | 18 ++-- simd/mips64/jcgray-mmi.c | 10 +- simd/mips64/jcgryext-mmi.c | 4 +- simd/mips64/jcsample-mmi.c | 4 +- simd/mips64/jdcolext-mmi.c | 130 ++++++++++++++------------ simd/mips64/jdmrgext-mmi.c | 183 ++++++++++++++++++++----------------- simd/mips64/jfdctfst-mmi.c | 112 +++++++++++------------ simd/mips64/jfdctint-mmi.c | 136 +++++++++++++-------------- simd/mips64/jidctfst-mmi.c | 107 +++++++++++----------- simd/mips64/jidctint-mmi.c | 121 ++++++++++++------------ simd/mips64/jquanti-mmi.c | 9 +- 12 files changed, 438 insertions(+), 400 deletions(-) diff --git a/simd/mips64/jccolext-mmi.c b/simd/mips64/jccolext-mmi.c index 558eb2ab..09913560 100644 --- a/simd/mips64/jccolext-mmi.c +++ b/simd/mips64/jccolext-mmi.c @@ -319,8 +319,8 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, #endif - /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) - * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) + /* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6) + * ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7) * * (Original) * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B diff --git a/simd/mips64/jccolor-mmi.c b/simd/mips64/jccolor-mmi.c index 93ef5c79..2f50e796 100644 --- a/simd/mips64/jccolor-mmi.c +++ b/simd/mips64/jccolor-mmi.c @@ -30,15 +30,15 @@ #include "jsimd_mmi.h" -#define F_0_081 ((short)5329) /* FIX(0.08131) */ -#define F_0_114 ((short)7471) /* FIX(0.11400) */ -#define F_0_168 ((short)11059) /* FIX(0.16874) */ -#define F_0_250 ((short)16384) /* FIX(0.25000) */ -#define F_0_299 ((short)19595) /* FIX(0.29900) */ -#define F_0_331 ((short)21709) /* FIX(0.33126) */ -#define F_0_418 ((short)27439) /* FIX(0.41869) */ -#define F_0_587 ((short)38470) /* FIX(0.58700) */ 
-#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ +#define F_0_081 ((short)5329) /* FIX(0.08131) */ +#define F_0_114 ((short)7471) /* FIX(0.11400) */ +#define F_0_168 ((short)11059) /* FIX(0.16874) */ +#define F_0_250 ((short)16384) /* FIX(0.25000) */ +#define F_0_299 ((short)19595) /* FIX(0.29900) */ +#define F_0_331 ((short)21709) /* FIX(0.33126) */ +#define F_0_418 ((short)27439) /* FIX(0.41869) */ +#define F_0_587 ((short)38470) /* FIX(0.58700) */ +#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ enum const_index { index_PD_ONEHALF, diff --git a/simd/mips64/jcgray-mmi.c b/simd/mips64/jcgray-mmi.c index 9c7b833f..c47c368c 100644 --- a/simd/mips64/jcgray-mmi.c +++ b/simd/mips64/jcgray-mmi.c @@ -28,11 +28,11 @@ #include "jsimd_mmi.h" -#define F_0_114 ((short)7471) /* FIX(0.11400) */ -#define F_0_250 ((short)16384) /* FIX(0.25000) */ -#define F_0_299 ((short)19595) /* FIX(0.29900) */ -#define F_0_587 ((short)38470) /* FIX(0.58700) */ -#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ +#define F_0_114 ((short)7471) /* FIX(0.11400) */ +#define F_0_250 ((short)16384) /* FIX(0.25000) */ +#define F_0_299 ((short)19595) /* FIX(0.29900) */ +#define F_0_587 ((short)38470) /* FIX(0.58700) */ +#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ enum const_index { index_PD_ONEHALF, diff --git a/simd/mips64/jcgryext-mmi.c b/simd/mips64/jcgryext-mmi.c index 08a83d66..d25fd827 100644 --- a/simd/mips64/jcgryext-mmi.c +++ b/simd/mips64/jcgryext-mmi.c @@ -310,8 +310,8 @@ void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, #endif - /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) - * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) + /* re = (R0 R2 R4 R6), ge = (G0 G2 G4 G6), be = (B0 B2 B4 B6) + * ro = (R1 R3 R5 R7), go = (G1 G3 G5 G7), bo = (B1 B3 B5 B7) * * (Original) * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B diff --git 
a/simd/mips64/jcsample-mmi.c b/simd/mips64/jcsample-mmi.c index 0354dac0..dbf2d67a 100644 --- a/simd/mips64/jcsample-mmi.c +++ b/simd/mips64/jcsample-mmi.c @@ -50,9 +50,9 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, output_cols * 2); bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */ - /* bias={1, 2, 1, 2} (16-bit) */ + /* bias = { 1, 2, 1, 2 } (16-bit) */ mask = _mm_cmpeq_pi16(mask, mask); - mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ + mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */ for (inrow = 0, outrow = 0; outrow < v_samp_factor; inrow += 2, outrow++) { diff --git a/simd/mips64/jdcolext-mmi.c b/simd/mips64/jdcolext-mmi.c index 3b5b2f20..a537e65c 100644 --- a/simd/mips64/jdcolext-mmi.c +++ b/simd/mips64/jdcolext-mmi.c @@ -2,7 +2,7 @@ * Loongson MMI optimizations for libjpeg-turbo * * Copyright 2009 Pierre Ossman for Cendio AB - * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. * All Rights Reserved. * Authors: ZhuChen @@ -116,8 +116,8 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, mask = decenter = 0.0; mask = _mm_cmpeq_pi16(mask, mask); decenter = _mm_cmpeq_pi16(decenter, decenter); - mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ - decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. 
} */ + decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */ cbe = _mm_and_si64(mask, cb); /* Cb(0246) */ cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */ @@ -139,15 +139,15 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, * B = Y - 0.22800 * Cb + Cb + Cb */ - cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */ - cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */ - cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */ - cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */ + cbe2 = _mm_add_pi16(cbe, cbe); /* 2 * CbE */ + cbo2 = _mm_add_pi16(cbo, cbo); /* 2 * CbO */ + cre2 = _mm_add_pi16(cre, cre); /* 2 * CrE */ + cro2 = _mm_add_pi16(cro, cro); /* 2 * CrO */ - be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800) */ - bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800) */ - re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */ - ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */ + be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2 * CbE * -FIX(0.22800)) */ + bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2 * CbO * -FIX(0.22800)) */ + re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2 * CrE * FIX(0.40200)) */ + ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2 * CrO * FIX(0.40200)) */ be = _mm_add_pi16(be, PW_ONE); bo = _mm_add_pi16(bo, PW_ONE); @@ -160,10 +160,10 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, be = _mm_add_pi16(be, cbe); bo = _mm_add_pi16(bo, cbo); - be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */ - bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */ - re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */ - ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */ + be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200)) = (B - Y)E */ + bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200)) = (B - Y)O */ + re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200)) = (R - Y)E */ + ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200)) = (R - Y)O */ gle = 
_mm_unpacklo_pi16(cbe, cre); ghe = _mm_unpackhi_pi16(cbe, cre); @@ -183,54 +183,64 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, glo = _mm_srai_pi32(glo, SCALEBITS); gho = _mm_srai_pi32(gho, SCALEBITS); - ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */ - go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */ - ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ - go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ + ge = _mm_packs_pi32(gle, ghe); /* CbE * -FIX(0.344) + CrE * FIX(0.285) */ + go = _mm_packs_pi32(glo, gho); /* CbO * -FIX(0.344) + CrO * FIX(0.285) */ + ge = _mm_sub_pi16(ge, cre); /* CbE * -FIX(0.344) + CrE * -FIX(0.714) = (G - Y)E */ + go = _mm_sub_pi16(go, cro); /* CbO * -FIX(0.344) + CrO * -FIX(0.714) = (G - Y)O */ ye = _mm_and_si64(mask, y); /* Y(0246) */ yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */ - re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */ - ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */ + re = _mm_add_pi16(re, ye); /* ((R - Y)E + YE) = (R0 R2 R4 R6) */ + ro = _mm_add_pi16(ro, yo); /* ((R - Y)O + YO) = (R1 R3 R5 R7) */ re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */ ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */ - ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */ - go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */ + ge = _mm_add_pi16(ge, ye); /* ((G - Y)E + YE) = (G0 G2 G4 G6) */ + go = _mm_add_pi16(go, yo); /* ((G - Y)O + YO) = (G1 G3 G5 G7) */ ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */ go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */ - be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */ - bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */ + be = _mm_add_pi16(be, ye); /* (YE + (B - Y)E) = (B0 B2 B4 B6) */ + bo = _mm_add_pi16(bo, yo); /* (YO + (B - Y)O) = (B1 B3 B5 B7) */ be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** 
** **) */ bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */ #if RGB_PIXELSIZE == 3 - /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ - /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ - mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ - mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */ - mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */ + /* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the + * mapping of components A, B, and C to red, green, and blue. + * + * mmA = (A0 A2 A4 A6 ** ** ** **) = AE + * mmB = (A1 A3 A5 A7 ** ** ** **) = AO + * mmC = (B0 B2 B4 B6 ** ** ** **) = BE + * mmD = (B1 B3 B5 B7 ** ** ** **) = BO + * mmE = (C0 C2 C4 C6 ** ** ** **) = CE + * mmF = (C1 C3 C5 C7 ** ** ** **) = CO + * mmG = (** ** ** ** ** ** ** **) + * mmH = (** ** ** ** ** ** ** **) + */ + mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */ + mmE = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */ + mmD = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */ mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT); - mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */ - mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */ + mmG = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 A5 A6 B6 C6 A7) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 A1 A2 B2 C2 A3) */ mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); - mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */ + mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (B3 C3 B5 C5 B7 C7 -- --) */ - mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */ - mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */ + mmC = _mm_unpackhi_pi16(mmD, mmH); /* (B5 C5 A6 B6 B7 C7 -- --) */ + mmD = _mm_unpacklo_pi16(mmD, mmH); /* (B1 C1 A2 B2 B3 C3 A4 B4) */ - mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */ - mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 
13 23 24 05 15 25) */ + mmF = _mm_unpackhi_pi16(mmE, mmB); /* (C6 A7 B7 C7 -- -- -- --) */ + mmE = _mm_unpacklo_pi16(mmE, mmB); /* (C2 A3 B3 C3 C4 A5 B5 C5) */ - mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */ - mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */ - mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */ + mmA = _mm_unpacklo_pi32(mmA, mmD); /* (A0 B0 C0 A1 B1 C1 A2 B2) */ + mmE = _mm_unpacklo_pi32(mmE, mmG); /* (C2 A3 B3 C3 A4 B4 C4 A5) */ + mmC = _mm_unpacklo_pi32(mmC, mmF); /* (B5 C5 A6 B6 C6 A7 B7 C7) */ if (num_cols >= 8) { if (!(((long)outptr) & 7)) { @@ -320,25 +330,33 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, xe = _mm_xor_si64(xe, xe); xo = _mm_xor_si64(xo, xo); #endif - /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ - /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ - /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ - /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ - mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ - mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ - mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ - mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ + /* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the + * mapping of components A, B, C, and D to red, green, and blue. 
+ * + * mmA = (A0 A2 A4 A6 ** ** ** **) = AE + * mmB = (A1 A3 A5 A7 ** ** ** **) = AO + * mmC = (B0 B2 B4 B6 ** ** ** **) = BE + * mmD = (B1 B3 B5 B7 ** ** ** **) = BO + * mmE = (C0 C2 C4 C6 ** ** ** **) = CE + * mmF = (C1 C3 C5 C7 ** ** ** **) = CO + * mmG = (D0 D2 D4 D6 ** ** ** **) = DE + * mmH = (D1 D3 D5 D7 ** ** ** **) = DO + */ + mmA = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */ + mmE = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */ + mmB = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */ + mmF = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */ - mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */ - mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */ - mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */ - mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */ + mmC = _mm_unpackhi_pi16(mmA, mmE); /* (A4 B4 C4 D4 A6 B6 C6 D6) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (A0 B0 C0 D0 A2 B2 C2 D2) */ + mmG = _mm_unpackhi_pi16(mmB, mmF); /* (A5 B5 C5 D5 A7 B7 C7 D7) */ + mmB = _mm_unpacklo_pi16(mmB, mmF); /* (A1 B1 C1 D1 A3 B3 C3 D3) */ - mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */ - mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */ - mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */ - mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */ + mmD = _mm_unpackhi_pi32(mmA, mmB); /* (A2 B2 C2 D2 A3 B3 C3 D3) */ + mmA = _mm_unpacklo_pi32(mmA, mmB); /* (A0 B0 C0 D0 A1 B1 C1 D1) */ + mmH = _mm_unpackhi_pi32(mmC, mmG); /* (A6 B6 C6 D6 A7 B7 C7 D7) */ + mmC = _mm_unpacklo_pi32(mmC, mmG); /* (A4 B4 C4 D4 A5 B5 C5 D5) */ if (num_cols >= 8) { if (!(((long)outptr) & 7)) { diff --git a/simd/mips64/jdmrgext-mmi.c b/simd/mips64/jdmrgext-mmi.c index be09ff2a..dc9e6439 100644 --- a/simd/mips64/jdmrgext-mmi.c +++ b/simd/mips64/jdmrgext-mmi.c @@ -2,7 +2,7 @@ * Loongson MMI optimizations for libjpeg-turbo * * 
Copyright 2009 Pierre Ossman for Cendio AB - * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015, 2019, 2025, D. R. Commander. All Rights Reserved. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. * All Rights Reserved. * Authors: ZhangLixia @@ -120,8 +120,8 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, mask = decenter = 0.0; mask = _mm_cmpeq_pi16(mask, mask); decenter = _mm_cmpeq_pi16(decenter, decenter); - mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ - decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + mask = _mm_srli_pi16(mask, BYTE_BIT); /* { 0xFF 0x00 0xFF 0x00 .. } */ + decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */ cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */ @@ -143,15 +143,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, * B = Y - 0.22800 * Cb + Cb + Cb */ - cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */ - cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */ - crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */ - crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */ + cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */ + cbh2 = _mm_add_pi16(cbh, cbh); /* 2 * CbH */ + crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */ + crh2 = _mm_add_pi16(crh, crh); /* 2 * CrH */ - bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */ - bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800) */ - rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */ - rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */ + bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */ + bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2 * CbH * -FIX(0.22800)) */ + rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */ + rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2 * CrH * FIX(0.40200)) */ bl = _mm_add_pi16(bl, PW_ONE); bh = _mm_add_pi16(bh, PW_ONE); @@ 
-164,10 +164,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, bl = _mm_add_pi16(bl, cbl); bh = _mm_add_pi16(bh, cbh); - bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */ - bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */ - rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */ - rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */ + bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */ + bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200)) = (B - Y)H */ + rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */ + rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200)) = (R - Y)H */ ga = _mm_unpacklo_pi16(cbl, crl); gb = _mm_unpackhi_pi16(cbl, crl); @@ -187,10 +187,10 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, gc = _mm_srai_pi32(gc, SCALEBITS); gd = _mm_srai_pi32(gd, SCALEBITS); - gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */ - gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */ - gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */ - gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */ + gl = _mm_packs_pi32(ga, gb); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */ + gh = _mm_packs_pi32(gc, gd); /* CbH * -FIX(0.344) + CrH * FIX(0.285) */ + gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */ + gh = _mm_sub_pi16(gh, crh); /* CbH * -FIX(0.344) + CrH * -FIX(0.714) = (G - Y)H */ ythise = _mm_and_si64(mask, ythis); /* Y(0246) */ ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */ @@ -220,38 +220,47 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, #if RGB_PIXELSIZE == 3 - /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */ - /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */ - /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */ - mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ - mmA = 
_mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */ - mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */ - mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */ - mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */ - mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */ + /* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the + * mapping of components A, B, and C to red, green, and blue. + * + * mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE + * mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO + * mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE + * mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO + * mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE + * mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO + * mmG = (** ** ** ** ** ** ** **) + * mmH = (** ** ** ** ** ** ** **) + */ + mmG = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */ + mmA = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */ + mmH = _mm_unpacklo_pi8(mmE, mmB); /* (C0 A1 C2 A3 C4 A5 C6 A7) */ + mmE = _mm_unpackhi_pi8(mmE, mmB); /* (C8 A9 Ca Ab Cc Ad Ce Af) */ + mmC = _mm_unpacklo_pi8(mmD, mmF); /* (B1 C1 B3 C3 B5 C5 B7 C7) */ + mmD = _mm_unpackhi_pi8(mmD, mmF); /* (B9 C9 Bb Cb Bd Cd Bf Cf) */ - mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */ - mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */ - mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */ - mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */ - mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */ - mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */ + mmB = _mm_unpacklo_pi16(mmG, mmA); /* (A0 B0 A8 B8 A2 B2 Aa Ba) */ + mmA = _mm_unpackhi_pi16(mmG, mmA); /* (A4 B4 Ac Bc A6 B6 Ae Be) */ + mmF = _mm_unpacklo_pi16(mmH, mmE); /* (C0 A1 C8 A9 C2 A3 Ca Ab) */ + mmE = _mm_unpackhi_pi16(mmH, mmE); /* (C4 A5 Cc Ad C6 A7 Ce Af) */ + mmH = _mm_unpacklo_pi16(mmC, mmD); /* (B1 C1 B9 C9 B3 C3 Bb Cb) */ + mmG = _mm_unpackhi_pi16(mmC, mmD); 
/* (B5 C5 Bd Cd B7 C7 Bf Cf) */ - mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */ + mmC = _mm_unpacklo_pi16(mmB, mmF); /* (A0 B0 C0 A1 A8 B8 C8 A9) */ mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT); - mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */ - mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */ - mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */ + mmB = _mm_unpacklo_pi16(mmH, mmB); /* (B1 C1 A2 B2 B9 C9 Aa Ba) */ + mmD = _mm_unpackhi_pi16(mmF, mmH); /* (C2 A3 B3 C3 Ca Ab Bb Cb) */ + mmF = _mm_unpacklo_pi16(mmA, mmE); /* (A4 B4 C4 A5 Ac Bc Cc Ad) */ mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT); - mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */ - mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */ + mmH = _mm_unpacklo_pi16(mmG, mmA); /* (B5 C5 A6 B6 Bd Cd Ae Be) */ + mmG = _mm_unpackhi_pi16(mmE, mmG); /* (C6 A7 B7 C7 Ce Af Bf Cf) */ - mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */ - mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */ - mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */ - mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */ - mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */ - mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */ + mmA = _mm_unpacklo_pi32(mmC, mmB); /* (A0 B0 C0 A1 B1 C1 A2 B2) */ + mmE = _mm_unpackhi_pi32(mmC, mmB); /* (A8 B8 C8 A9 B9 C9 Aa Ba) */ + mmB = _mm_unpacklo_pi32(mmD, mmF); /* (C2 A3 B3 C3 A4 B4 C4 A5) */ + mmF = _mm_unpackhi_pi32(mmD, mmF); /* (Ca Ab Bb Cb Ac Bc Cc Ad) */ + mmC = _mm_unpacklo_pi32(mmH, mmG); /* (B5 C5 A6 B6 C6 A7 B7 C7) */ + mmG = _mm_unpackhi_pi32(mmH, mmG); /* (Bd Cd Ae Be Ce Af Bf Cf) */ if (num_cols >= 8) { if (!(((long)outptr) & 7)) { @@ -367,40 +376,48 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, xe = _mm_xor_si64(xe, xe); xo = _mm_xor_si64(xo, xo); #endif - /* mmA=(00 02 04 06 08 0A 0C 0E), 
mmB=(01 03 05 07 09 0B 0D 0F) */ - /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */ - /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */ - /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */ - mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ - mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */ - mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ - mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */ + /* NOTE: The values of RGB_RED, RGB_GREEN, and RGB_BLUE determine the + * mapping of components A, B, C, and D to red, green, and blue. + * + * mmA = (A0 A2 A4 A6 A8 Aa Ac Ae) = AE + * mmB = (A1 A3 A5 A7 A9 Ab Ad Af) = AO + * mmC = (B0 B2 B4 B6 B8 Ba Bc Be) = BE + * mmD = (B1 B3 B5 B7 B9 Bb Bd Bf) = BO + * mmE = (C0 C2 C4 C6 C8 Ca Cc Ce) = CE + * mmF = (C1 C3 C5 C7 C9 Cb Cd Cf) = CO + * mmG = (D0 D2 D4 D6 D8 Da Dc De) = DE + * mmH = (D1 D3 D5 D7 D9 Db Dd Df) = DO + */ + mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (A0 B0 A2 B2 A4 B4 A6 B6) */ + mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (A8 B8 Aa Ba Ac Bc Ae Be) */ + mmA = _mm_unpacklo_pi8(mmE, mmG); /* (C0 D0 C2 D2 C4 D4 C6 D6) */ + mmE = _mm_unpackhi_pi8(mmE, mmG); /* (C8 D8 Ca Da Cc Dc Ce De) */ - mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ - mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */ - mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ - mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */ + mmG = _mm_unpacklo_pi8(mmB, mmD); /* (A1 B1 A3 B3 A5 B5 A7 B7) */ + mmB = _mm_unpackhi_pi8(mmB, mmD); /* (A9 B9 Ab Bb Ad Bd Af Bf) */ + mmD = _mm_unpacklo_pi8(mmF, mmH); /* (C1 D1 C3 D3 C5 D5 C7 D7) */ + mmF = _mm_unpackhi_pi8(mmF, mmH); /* (C9 D9 Cb Db Cd Dd Cf Df) */ - mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */ - mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */ - mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 
23 33) */ - mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */ + mmH = _mm_unpacklo_pi16(mm8, mmA); /* (A0 B0 C0 D0 A2 B2 C2 D2) */ + mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (A4 B4 C4 D4 A6 B6 C6 D6) */ + mmA = _mm_unpacklo_pi16(mmG, mmD); /* (A1 B1 C1 D1 A3 B3 C3 D3) */ + mmD = _mm_unpackhi_pi16(mmG, mmD); /* (A5 B5 C5 D5 A7 B7 C7 D7) */ - mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */ - mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */ - mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */ - mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */ + mmG = _mm_unpackhi_pi16(mm9, mmE); /* (Ac Bc Cc Dc Ae Be Ce De) */ + mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (A8 B8 C8 D8 Aa Ba Ca Da) */ + mmE = _mm_unpacklo_pi16(mmB, mmF); /* (A9 B9 C9 D9 Ab Bb Cb Db) */ + mmF = _mm_unpackhi_pi16(mmB, mmF); /* (Ad Bd Cd Dd Af Bf Cf Df) */ - mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */ - mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */ - mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */ - mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */ + mmB = _mm_unpackhi_pi32(mmH, mmA); /* (A2 B2 C2 D2 A3 B3 C3 D3) */ + mmA = _mm_unpacklo_pi32(mmH, mmA); /* (A0 B0 C0 D0 A1 B1 C1 D1) */ + mmC = _mm_unpacklo_pi32(mm8, mmD); /* (A4 B4 C4 D4 A5 B5 C5 D5) */ + mmD = _mm_unpackhi_pi32(mm8, mmD); /* (A6 B6 C6 D6 A7 B7 C7 D7) */ - mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */ - mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */ - mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */ - mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */ + mmH = _mm_unpackhi_pi32(mmG, mmF); /* (Ae Be Ce De Af Bf Cf Df) */ + mmG = _mm_unpacklo_pi32(mmG, mmF); /* (Ac Bc Cc Dc Ad Bd Cd Dd) */ + mmF = _mm_unpackhi_pi32(mm9, mmE); /* (Aa Ba Ca Da Ab Bb Cb Db) */ + mmE = _mm_unpacklo_pi32(mm9, mmE); /* (A8 B8 C8 D8 A9 
B9 C9 D9) */ if (num_cols >= 8) { if (!(((long)outptr) & 7)) { @@ -505,17 +522,17 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, decenter = 0.0; decenter = _mm_cmpeq_pi16(decenter, decenter); - decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + decenter = _mm_slli_pi16(decenter, 7); /* { 0xFF80 0xFF80 0xFF80 0xFF80 } */ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */ cbl = _mm_add_pi16(cbl, decenter); crl = _mm_add_pi16(crl, decenter); - cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */ - crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */ - bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800) */ - rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */ + cbl2 = _mm_add_pi16(cbl, cbl); /* 2 * CbL */ + crl2 = _mm_add_pi16(crl, crl); /* 2 * CrL */ + bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2 * CbL * -FIX(0.22800)) */ + rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2 * CrL * FIX(0.40200)) */ bl = _mm_add_pi16(bl, PW_ONE); bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */ @@ -523,15 +540,15 @@ void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width, rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */ bl = _mm_add_pi16(bl, cbl); - bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */ - rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */ + bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200)) = (B - Y)L */ + rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200)) = (R - Y)L */ gl = _mm_unpacklo_pi16(cbl, crl); gl = _mm_madd_pi16(gl, PW_MF0344_F0285); gl = _mm_add_pi32(gl, PD_ONEHALF); gl = _mm_srai_pi32(gl, SCALEBITS); - gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */ - gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */ + gl = _mm_packs_pi32(gl, zero); /* CbL * -FIX(0.344) + CrL * FIX(0.285) */ + gl = _mm_sub_pi16(gl, crl); /* CbL * -FIX(0.344) + CrL * -FIX(0.714) = (G - Y)L */ yl = _mm_unpacklo_pi8(y, zero); /* 
Y(0123) */ rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */ diff --git a/simd/mips64/jfdctfst-mmi.c b/simd/mips64/jfdctfst-mmi.c index f7caf09a..77a26b18 100644 --- a/simd/mips64/jfdctfst-mmi.c +++ b/simd/mips64/jfdctfst-mmi.c @@ -122,46 +122,46 @@ static uint64_t const_value[] = { __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ \ - row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ - row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ - row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ - row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ - row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ - row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ - row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ - row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ + row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ + row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ + row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ + row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ + row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ \ /* Transpose coefficients */ \ \ - row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ - row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ - row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ - row23d = _mm_unpackhi_pi16(row2h, 
row3h); /* row23d=(26 36 27 37) */ \ + row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \ + row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \ + row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \ + row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \ \ - row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ - row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ - row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ - row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ + row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \ + row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \ + row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \ + row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \ \ - col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ - col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ - col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ - col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ + col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \ + col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \ + col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \ + col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \ \ - tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ - tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ - tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ - tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ + tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \ + tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \ + tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \ + tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \ \ - col2 
= _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ - col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ - col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ - col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ + col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \ + col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \ + col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \ + col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \ \ - tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ - tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ - tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ - tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ + tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \ + tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \ + tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \ + tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \ \ DO_FDCT_COMMON() \ \ @@ -191,35 +191,35 @@ static uint64_t const_value[] = { \ /* Transpose coefficients */ \ \ - col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ - col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ - col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ - col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ + col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \ + col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \ + col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \ + col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \ \ - col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ - col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ - col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ - col01d = 
_mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ + col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \ + col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \ + col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \ + col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \ \ - row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ - row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ - row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ - row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ + row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \ + row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1 = (10 11 12 13) */ \ + row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \ + row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \ \ - tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ - tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ - tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ - tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ + tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \ + tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \ + tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \ + tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \ \ - row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ - row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ - row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ - row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ + row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \ + row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \ + row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \ + row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \ 
\ - tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ - tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ - tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ - tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ + tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \ + tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \ + tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \ + tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \ \ DO_FDCT_COMMON() \ \ diff --git a/simd/mips64/jfdctint-mmi.c b/simd/mips64/jfdctint-mmi.c index 7f4dfe91..18a33b04 100644 --- a/simd/mips64/jfdctint-mmi.c +++ b/simd/mips64/jfdctint-mmi.c @@ -237,56 +237,56 @@ static uint64_t const_value[] = { __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ __m64 tmp10, tmp11; \ \ - row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ - row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ - row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ - row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ - row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ - row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ - row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ - row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ + row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ + row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ + row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ + row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ + row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ + row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ + row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); 
/* (30 31 32 33) */ \ + row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ \ /* Transpose coefficients */ \ \ - row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ - row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ - row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ - row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ + row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a = (20 30 21 31) */ \ + row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b = (22 32 23 33) */ \ + row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c = (24 34 25 35) */ \ + row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d = (26 36 27 37) */ \ \ - row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ - row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ - row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ - row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ + row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a = (00 10 01 11) */ \ + row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b = (02 12 03 13) */ \ + row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c = (04 14 05 15) */ \ + row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d = (06 16 07 17) */ \ \ - col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ - col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ - col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ - col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ + col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0 = (00 10 20 30) */ \ + col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1 = (01 11 21 31) */ \ + col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6 = (06 16 26 36) */ \ + col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7 = (07 17 27 37) */ \ \ - tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ - tmp7 = _mm_sub_pi16(col0, col7); 
/* tmp7=col0-col7 */ \ - tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ - tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ + tmp6 = _mm_sub_pi16(col1, col6); /* tmp6 = col1 - col6 */ \ + tmp7 = _mm_sub_pi16(col0, col7); /* tmp7 = col0 - col7 */ \ + tmp1 = _mm_add_pi16(col1, col6); /* tmp1 = col1 + col6 */ \ + tmp0 = _mm_add_pi16(col0, col7); /* tmp0 = col0 + col7 */ \ \ - col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ - col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ - col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ - col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ + col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2 = (02 12 22 32) */ \ + col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3 = (03 13 23 33) */ \ + col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4 = (04 14 24 34) */ \ + col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5 = (05 15 25 35) */ \ \ - tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ - tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ - tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ - tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ + tmp3 = _mm_add_pi16(col3, col4); /* tmp3 = col3 + col4 */ \ + tmp2 = _mm_add_pi16(col2, col5); /* tmp2 = col2 + col5 */ \ + tmp4 = _mm_sub_pi16(col3, col4); /* tmp4 = col3 - col4 */ \ + tmp5 = _mm_sub_pi16(col2, col5); /* tmp5 = col2 - col5 */ \ \ /* Even part */ \ \ - tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ - tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ - tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ - tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \ \ - out0 = _mm_add_pi16(tmp10, 
tmp11); /* out0=tmp10+tmp11 */ \ - out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \ out0 = _mm_slli_pi16(out0, PASS1_BITS); \ out4 = _mm_slli_pi16(out4, PASS1_BITS); \ \ @@ -319,45 +319,45 @@ static uint64_t const_value[] = { \ /* Transpose coefficients */ \ \ - col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ - col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ - col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ - col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ + col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a = (02 03 12 13) */ \ + col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b = (22 23 32 33) */ \ + col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c = (42 43 52 53) */ \ + col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d = (62 63 72 73) */ \ \ - col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ - col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ - col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ - col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ + col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a = (00 01 10 11) */ \ + col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b = (20 21 30 31) */ \ + col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c = (40 41 50 51) */ \ + col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d = (60 61 70 71) */ \ \ - row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ - row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ - row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ - row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ + row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0 = (00 01 02 03) */ \ + row1 = _mm_unpackhi_pi32(col01a, col23a); /* 
row1 = (10 11 12 13) */ \ + row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6 = (60 61 62 63) */ \ + row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7 = (70 71 72 73) */ \ \ - tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ - tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ - tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ - tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ + tmp6 = _mm_sub_pi16(row1, row6); /* tmp6 = row1 - row6 */ \ + tmp7 = _mm_sub_pi16(row0, row7); /* tmp7 = row0 - row7 */ \ + tmp1 = _mm_add_pi16(row1, row6); /* tmp1 = row1 + row6 */ \ + tmp0 = _mm_add_pi16(row0, row7); /* tmp0 = row0 + row7 */ \ \ - row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ - row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ - row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ - row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ + row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2 = (20 21 22 23) */ \ + row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3 = (30 31 32 33) */ \ + row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4 = (40 41 42 43) */ \ + row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5 = (50 51 52 53) */ \ \ - tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ - tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ - tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ - tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ + tmp3 = _mm_add_pi16(row3, row4); /* tmp3 = row3 + row4 */ \ + tmp2 = _mm_add_pi16(row2, row5); /* tmp2 = row2 + row5 */ \ + tmp4 = _mm_sub_pi16(row3, row4); /* tmp4 = row3 - row4 */ \ + tmp5 = _mm_sub_pi16(row2, row5); /* tmp5 = row2 - row5 */ \ \ /* Even part */ \ \ - tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ - tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ - tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ - tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ + tmp10 = 
_mm_add_pi16(tmp0, tmp3); /* tmp10 = tmp0 + tmp3 */ \ + tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13 = tmp0 - tmp3 */ \ + tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11 = tmp1 + tmp2 */ \ + tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12 = tmp1 - tmp2 */ \ \ - out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ - out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + out0 = _mm_add_pi16(tmp10, tmp11); /* out0 = tmp10 + tmp11 */ \ + out4 = _mm_sub_pi16(tmp10, tmp11); /* out4 = tmp10 - tmp11 */ \ \ out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \ out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \ diff --git a/simd/mips64/jidctfst-mmi.c b/simd/mips64/jidctfst-mmi.c index 503bb35a..acd090fd 100644 --- a/simd/mips64/jidctfst-mmi.c +++ b/simd/mips64/jidctfst-mmi.c @@ -34,11 +34,11 @@ #define CONST_BITS 8 #define PASS1_BITS 2 -#define FIX_1_082 ((short)277) /* FIX(1.082392200) */ -#define FIX_1_414 ((short)362) /* FIX(1.414213562) */ -#define FIX_1_847 ((short)473) /* FIX(1.847759065) */ -#define FIX_2_613 ((short)669) /* FIX(2.613125930) */ -#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */ +#define FIX_1_082 ((short)277) /* FIX(1.082392200) */ +#define FIX_1_414 ((short)362) /* FIX(1.414213562) */ +#define FIX_1_847 ((short)473) /* FIX(1.847759065) */ +#define FIX_2_613 ((short)669) /* FIX(2.613125930) */ +#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */ #define PRE_MULTIPLY_SCALE_BITS 2 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) @@ -156,15 +156,15 @@ static uint64_t const_value[] = { \ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ \ - dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \ + dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval = (00 10 20 30) */ \ \ - dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ - dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ + dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 
10 10) */ \ + dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \ \ - row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ - row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ - row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ - row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ + row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \ + row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \ + row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \ + row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \ \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ @@ -234,32 +234,33 @@ static uint64_t const_value[] = { \ DO_IDCT_COMMON() \ \ - /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ - /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ - /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ - /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ + /* out0 = (00 10 20 30), out1 = (01 11 21 31) \ + * out2 = (02 12 22 32), out3 = (03 13 23 33) \ + * out4 = (04 14 24 34), out5 = (05 15 25 35) \ + * out6 = (06 16 26 36), out7 = (07 17 27 37) \ + */ \ \ /* Transpose coefficients */ \ \ - row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ - row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ - row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ - row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ + row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \ + row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \ + row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \ + row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \ \ - row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ - row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 
23 32 33) */ \ - row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ - row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ + row01b = _mm_unpacklo_pi16(out2, out3); /* row01b = (02 03 12 13) */ \ + row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \ + row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \ + row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \ \ - row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ - row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ - row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ - row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ + row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \ + row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \ + row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \ + row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \ \ - row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ - row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ - row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ - row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ + row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \ + row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \ + row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \ + row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \ \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ @@ -312,11 +313,11 @@ static uint64_t const_value[] = { \ DO_IDCT_COMMON() \ \ - /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ - /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ - /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ - /* out6=(60 61 62 
63), out7=(70 71 72 73) */ \ - \ + /* out0 = (00 01 02 03), out1 = (10 11 12 13) \ + * out2 = (20 21 22 23), out3 = (30 31 32 33) \ + * out4 = (40 41 42 43), out5 = (50 51 52 53) \ + * out6 = (60 61 62 63), out7 = (70 71 72 73) \ + */ \ out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \ out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \ out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \ @@ -326,10 +327,10 @@ static uint64_t const_value[] = { out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \ out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \ \ - row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ - row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ - row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ - row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ + row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \ + row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \ + row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \ + row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \ \ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ @@ -338,20 +339,20 @@ static uint64_t const_value[] = { \ /* Transpose coefficients */ \ \ - col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ - col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ - col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ - col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ + col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \ + col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \ + col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \ + col0123c = 
_mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \ \ - col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ - col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ - col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ - col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ + col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l = (00 10 20 30 01 11 21 31) */ \ + col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \ + col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \ + col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \ \ - col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ - col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ - col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ - col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ + col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \ + col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \ + col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \ + col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \ \ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ diff --git a/simd/mips64/jidctint-mmi.c b/simd/mips64/jidctint-mmi.c index 18bc7f95..09fbb2e9 100644 --- a/simd/mips64/jidctint-mmi.c +++ b/simd/mips64/jidctint-mmi.c @@ -40,18 +40,18 @@ #define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) #define CENTERJSAMPLE 128 -#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ -#define FIX_0_390 ((short)3196) /* FIX(0.390180644) 
*/ -#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ -#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ -#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ -#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ -#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ -#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ -#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ -#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ -#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ -#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ +#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ +#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ +#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ +#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ +#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ +#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ +#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ +#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ +#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ +#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ +#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ +#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ enum const_index { index_PW_F130_F054, @@ -293,15 +293,15 @@ static uint64_t const_value[] = { quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ \ dcval = _mm_mullo_pi16(col0l, quant0l); \ - dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \ + dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval = (00 10 20 30) */ \ \ - dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ - dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ + dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall = (00 00 10 10) */ \ + dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh = (20 20 30 30) */ \ \ - row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ - row1 = _mm_unpackhi_pi32(dcvall, 
dcvall); /* row1=(10 10 10 10) */ \ - row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ - row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ + row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0 = (00 00 00 00) */ \ + row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1 = (10 10 10 10) */ \ + row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2 = (20 20 20 20) */ \ + row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3 = (30 30 30 30) */ \ \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ @@ -392,32 +392,33 @@ static uint64_t const_value[] = { \ DO_IDCT_COMMON(1) \ \ - /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ - /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ - /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ - /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ + /* out0 = (00 10 20 30), out1 = (01 11 21 31) \ + * out2 = (02 12 22 32), out3 = (03 13 23 33) \ + * out4 = (04 14 24 34), out5 = (05 15 25 35) \ + * out6 = (06 16 26 36), out7 = (07 17 27 37) \ + */ \ \ /* Transpose coefficients */ \ \ - row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ - row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ - row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ - row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ + row01a = _mm_unpacklo_pi16(out0, out1); /* row01a = (00 01 10 11) */ \ + row23a = _mm_unpackhi_pi16(out0, out1); /* row23a = (20 21 30 31) */ \ + row01d = _mm_unpacklo_pi16(out6, out7); /* row01d = (06 07 16 17) */ \ + row23d = _mm_unpackhi_pi16(out6, out7); /* row23d = (26 27 36 37) */ \ \ - row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ - row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ - row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ - row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ + row01b = _mm_unpacklo_pi16(out2, 
out3); /* row01b = (02 03 12 13) */ \ + row23b = _mm_unpackhi_pi16(out2, out3); /* row23b = (22 23 32 33) */ \ + row01c = _mm_unpacklo_pi16(out4, out5); /* row01c = (04 05 14 15) */ \ + row23c = _mm_unpackhi_pi16(out4, out5); /* row23c = (24 25 34 35) */ \ \ - row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ - row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ - row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ - row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ + row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l = (00 01 02 03) */ \ + row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l = (10 11 12 13) */ \ + row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l = (20 21 22 23) */ \ + row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l = (30 31 32 33) */ \ \ - row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ - row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ - row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ - row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ + row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h = (04 05 06 07) */ \ + row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h = (14 15 16 17) */ \ + row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h = (24 25 26 27) */ \ + row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h = (34 35 36 37) */ \ \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ @@ -497,15 +498,15 @@ static uint64_t const_value[] = { \ DO_IDCT_COMMON(2) \ \ - /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ - /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ - /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ - /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ - \ - row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ - row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ - 
row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ - row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ + /* out0 = (00 01 02 03), out1 = (10 11 12 13) \ + * out2 = (20 21 22 23), out3 = (30 31 32 33) \ + * out4 = (40 41 42 43), out5 = (50 51 52 53) \ + * out6 = (60 61 62 63), out7 = (70 71 72 73) \ + */ \ + row06 = _mm_packs_pi16(out0, out6); /* row06 = (00 01 02 03 60 61 62 63) */ \ + row17 = _mm_packs_pi16(out1, out7); /* row17 = (10 11 12 13 70 71 72 73) */ \ + row24 = _mm_packs_pi16(out2, out4); /* row24 = (20 21 22 23 40 41 42 43) */ \ + row35 = _mm_packs_pi16(out3, out5); /* row35 = (30 31 32 33 50 51 52 53) */ \ \ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ @@ -514,20 +515,20 @@ static uint64_t const_value[] = { \ /* Transpose coefficients */ \ \ - col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ - col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ - col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ - col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ + col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a = (00 10 01 11 02 12 03 13) */ \ + col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d = (60 70 61 71 62 72 63 73) */ \ + col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b = (20 30 21 31 22 32 23 33) */ \ + col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c = (40 50 41 51 42 52 43 53) */ \ \ - col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ - col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ - col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ - col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ + col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* 
col01l = (00 10 20 30 01 11 21 31) */ \ + col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l = (02 12 22 32 03 13 23 33) */ \ + col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h = (40 50 60 70 41 51 61 71) */ \ + col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h = (42 52 62 72 43 53 63 73) */ \ \ - col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ - col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ - col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ - col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ + col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0 = (00 10 20 30 40 50 60 70) */ \ + col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1 = (01 11 21 31 41 51 61 71) */ \ + col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2 = (02 12 22 32 42 52 62 72) */ \ + col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3 = (03 13 23 33 43 53 63 73) */ \ \ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ diff --git a/simd/mips64/jquanti-mmi.c b/simd/mips64/jquanti-mmi.c index 339002fd..59b80e07 100644 --- a/simd/mips64/jquanti-mmi.c +++ b/simd/mips64/jquanti-mmi.c @@ -6,7 +6,7 @@ * Authors: ZhuChen * CaiWanwei * SunZhangzhi - * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved. + * Copyright (C) 2018-2019, 2025, D. R. Commander. All Rights Reserved. * * Based on the x86 SIMD extension for IJG JPEG library * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -62,9 +62,10 @@ rowl = _mm_mulhi_pi16(rowl, recipl); \ rowh = _mm_mulhi_pi16(rowh, reciph); \ \ - /* reciprocal is always negative (MSB=1), so we always need to add the */ \ - /* initial value (input value is never negative as we inverted it at the */ \ - /* start of this routine) */ \ + /* reciprocal is always negative (MSB = 1), so we always need to add the \ + * initial value. 
(The input value is never negative, as we inverted it at \ + * the start of this routine.) \ + */ \ rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \ rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \ \