sdl

FORK: Simple Directmedia Layer
git clone https://git.neptards.moe/neptards/sdl.git
Log | Files | Refs

yuv_rgb_sse_func.h (19011B)


      1 // Copyright 2016 Adrien Descamps
      2 // Distributed under BSD 3-Clause License
      3 
      4 /* You need to define the following macros before including this file:
      5 	SSE_FUNCTION_NAME
      6 	STD_FUNCTION_NAME
      7 	YUV_FORMAT
      8 	RGB_FORMAT
      9 */
     10 /* You may define the following macro, which affects generated code:
     11 	SSE_ALIGNED
     12 */
     13 
     14 #ifdef SSE_ALIGNED
     15 /* Unaligned instructions seem faster, even on aligned data? */
     16 /*
     17 #define LOAD_SI128 _mm_load_si128
     18 #define SAVE_SI128 _mm_stream_si128
     19 */
     20 #define LOAD_SI128 _mm_loadu_si128
     21 #define SAVE_SI128 _mm_storeu_si128
     22 #else
     23 #define LOAD_SI128 _mm_loadu_si128
     24 #define SAVE_SI128 _mm_storeu_si128
     25 #endif
     26 
     27 #define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
     28 	r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
     29 	g_tmp = _mm_add_epi16( \
     30 		_mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
     31 		_mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
     32 	b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
     33 	R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
     34 	G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
     35 	B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
     36 	R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
     37 	G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
     38 	B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
     39 
     40 #define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
     41 	Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
     42 	Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
     43 	\
     44 	R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
     45 	G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
     46 	B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
     47 	R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
     48 	G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
     49 	B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
     50 
     51 #define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
     52 { \
     53 	__m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
     54 \
     55 	red_mask = _mm_set1_epi16((short)0xF800); \
     56 	RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
     57 	RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
     58 	RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
     59 	RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
     60 	tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
     61 	tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
     62 	tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
     63 	tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
     64 	RGB1 = _mm_or_si128(RGB1, tmp1); \
     65 	RGB2 = _mm_or_si128(RGB2, tmp2); \
     66 	RGB3 = _mm_or_si128(RGB3, tmp3); \
     67 	RGB4 = _mm_or_si128(RGB4, tmp4); \
     68 	tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
     69 	tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
     70 	tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
     71 	tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
     72 	RGB1 = _mm_or_si128(RGB1, tmp1); \
     73 	RGB2 = _mm_or_si128(RGB2, tmp2); \
     74 	RGB3 = _mm_or_si128(RGB3, tmp3); \
     75 	RGB4 = _mm_or_si128(RGB4, tmp4); \
     76 }
     77 
     78 #define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     79 RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
     80 RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
     81 RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
     82 RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
     83 RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
     84 RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
     85 
     86 #define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     87 R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
     88 R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
     89 G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
     90 G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
     91 B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
     92 B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
     93 
     94 #define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     95 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     96 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     97 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     98 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
     99 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
    100 
    101 #define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
    102 { \
    103 	__m128i lo_ab, hi_ab, lo_gr, hi_gr; \
    104 \
    105 	lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
    106 	hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
    107 	lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
    108 	hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
    109 	RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
    110 	RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
    111 	RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
    112 	RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
    113 \
    114 	lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
    115 	hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
    116 	lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
    117 	hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
    118 	RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
    119 	RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
    120 	RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
    121 	RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
    122 }
    123 
    124 #if RGB_FORMAT == RGB_FORMAT_RGB565
    125 
    126 #define PACK_PIXEL \
    127 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    128 	\
    129 	PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
    130 	\
    131 	PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
    132 
    133 #elif RGB_FORMAT == RGB_FORMAT_RGB24
    134 
    135 #define PACK_PIXEL \
    136 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
    137 	__m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
    138 	\
    139 	PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
    140 	\
    141 	PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
    142 
    143 #elif RGB_FORMAT == RGB_FORMAT_RGBA
    144 
    145 #define PACK_PIXEL \
    146 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    147 	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    148 	__m128i a = _mm_set1_epi8((char)0xFF); \
    149 	\
    150 	PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    151 	\
    152 	PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
    153 
    154 #elif RGB_FORMAT == RGB_FORMAT_BGRA
    155 
    156 #define PACK_PIXEL \
    157 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    158 	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    159 	__m128i a = _mm_set1_epi8((char)0xFF); \
    160 	\
    161 	PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    162 	\
    163 	PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
    164 
    165 #elif RGB_FORMAT == RGB_FORMAT_ARGB
    166 
    167 #define PACK_PIXEL \
    168 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    169 	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    170 	__m128i a = _mm_set1_epi8((char)0xFF); \
    171 	\
    172 	PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    173 	\
    174 	PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
    175 
    176 #elif RGB_FORMAT == RGB_FORMAT_ABGR
    177 
    178 #define PACK_PIXEL \
    179 	__m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
    180 	__m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
    181 	__m128i a = _mm_set1_epi8((char)0xFF); \
    182 	\
    183 	PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
    184 	\
    185 	PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
    186 
    187 #else
    188 #error PACK_PIXEL unimplemented
    189 #endif
    190 
    191 #if RGB_FORMAT == RGB_FORMAT_RGB565
    192 
    193 #define SAVE_LINE1 \
    194 	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
    195 	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
    196 	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
    197 	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
    198 
    199 #define SAVE_LINE2 \
    200 	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
    201 	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
    202 	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
    203 	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
    204 
    205 #elif RGB_FORMAT == RGB_FORMAT_RGB24
    206 
    207 #define SAVE_LINE1 \
    208 	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
    209 	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
    210 	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
    211 	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
    212 	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
    213 	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
    214 
    215 #define SAVE_LINE2 \
    216 	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
    217 	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
    218 	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
    219 	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
    220 	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
    221 	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
    222 
    223 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
    224       RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
    225 
    226 #define SAVE_LINE1 \
    227 	SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
    228 	SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
    229 	SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
    230 	SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
    231 	SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
    232 	SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
    233 	SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
    234 	SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
    235 
    236 #define SAVE_LINE2 \
    237 	SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
    238 	SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
    239 	SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
    240 	SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
    241 	SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
    242 	SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
    243 	SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
    244 	SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
    245 
    246 #else
    247 #error SAVE_LINE unimplemented
    248 #endif
    249 
    250 #if YUV_FORMAT == YUV_FORMAT_420
    251 
    252 #define READ_Y(y_ptr) \
    253 	y = LOAD_SI128((const __m128i*)(y_ptr)); \
    254 
    255 #define READ_UV	\
    256 	u = LOAD_SI128((const __m128i*)(u_ptr)); \
    257 	v = LOAD_SI128((const __m128i*)(v_ptr)); \
    258 
    259 #elif YUV_FORMAT == YUV_FORMAT_422
    260 
    261 #define READ_Y(y_ptr) \
    262 { \
    263 	__m128i y1, y2; \
    264 	y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
    265 	y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
    266 	y = _mm_packus_epi16(y1, y2); \
    267 }
    268 
    269 #define READ_UV	\
    270 { \
    271 	__m128i u1, u2, u3, u4, v1, v2, v3, v4; \
    272 	u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
    273 	u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
    274 	u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
    275 	u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
    276 	u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
    277 	v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
    278 	v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
    279 	v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
    280 	v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
    281 	v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
    282 }
    283 
    284 #elif YUV_FORMAT == YUV_FORMAT_NV12
    285 
    286 #define READ_Y(y_ptr) \
    287 	y = LOAD_SI128((const __m128i*)(y_ptr)); \
    288 
    289 #define READ_UV	\
    290 { \
    291 	__m128i u1, u2, v1, v2; \
    292 	u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
    293 	u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
    294 	u = _mm_packus_epi16(u1, u2); \
    295 	v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
    296 	v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
    297 	v = _mm_packus_epi16(v1, v2); \
    298 }
    299 
    300 #else
    301 #error READ_UV unimplemented
    302 #endif
    303 
    304 #define YUV2RGB_32 \
    305 	__m128i r_tmp, g_tmp, b_tmp; \
    306 	__m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
    307 	__m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
    308 	__m128i y_16_1, y_16_2; \
    309 	__m128i y, u, v, u_16, v_16; \
    310     __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
    311     __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
    312 	\
    313 	READ_UV \
    314 	\
    315 	/* process first 16 pixels of first line */\
    316 	u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
    317 	v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
    318 	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
    319 	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
    320 	\
    321 	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    322 	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
    323 	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
    324 	\
    325 	READ_Y(y_ptr1) \
    326 	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    327 	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    328 	\
    329 	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    330 	\
    331 	r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
    332 	g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
    333 	b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
    334 	\
    335 	/* process first 16 pixels of second line */\
    336 	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
    337 	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
    338 	\
    339 	READ_Y(y_ptr2) \
    340 	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    341 	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    342 	\
    343 	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    344 	\
    345 	r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
    346 	g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
    347 	b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
    348 	\
    349 	/* process last 16 pixels of first line */\
    350 	u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
    351 	v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
    352 	u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
    353 	v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
    354 	\
    355 	UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    356 	r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
    357 	r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
    358 	\
    359 	READ_Y(y_ptr1+16*y_pixel_stride) \
    360 	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    361 	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    362 	\
    363 	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    364 	\
    365 	r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
    366 	g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
    367 	b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
    368 	\
    369 	/* process last 16 pixels of second line */\
    370 	r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
    371 	r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
    372 	\
    373 	READ_Y(y_ptr2+16*y_pixel_stride) \
    374 	y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
    375 	y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
    376 	\
    377 	ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
    378 	\
    379 	r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
    380 	g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
    381 	b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
    382 	\
    383 
    384 
    385 void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, 
    386 	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 
    387 	uint8_t *RGB, uint32_t RGB_stride, 
    388 	YCbCrType yuv_type)
    389 {
    390 	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
    391 #if YUV_FORMAT == YUV_FORMAT_420
    392 	const int y_pixel_stride = 1;
    393 	const int uv_pixel_stride = 1;
    394 	const int uv_x_sample_interval = 2;
    395 	const int uv_y_sample_interval = 2;
    396 #elif YUV_FORMAT == YUV_FORMAT_422
    397 	const int y_pixel_stride = 2;
    398 	const int uv_pixel_stride = 4;
    399 	const int uv_x_sample_interval = 2;
    400 	const int uv_y_sample_interval = 1;
    401 #elif YUV_FORMAT == YUV_FORMAT_NV12
    402 	const int y_pixel_stride = 1;
    403 	const int uv_pixel_stride = 2;
    404 	const int uv_x_sample_interval = 2;
    405 	const int uv_y_sample_interval = 2;
    406 #endif
    407 #if RGB_FORMAT == RGB_FORMAT_RGB565
    408 	const int rgb_pixel_stride = 2;
    409 #elif RGB_FORMAT == RGB_FORMAT_RGB24
    410 	const int rgb_pixel_stride = 3;
    411 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
    412       RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
    413 	const int rgb_pixel_stride = 4;
    414 #else
    415 #error Unknown RGB pixel size
    416 #endif
    417 
    418 	if (width >= 32) {
    419 		uint32_t xpos, ypos;
    420 		for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval)
    421 		{
    422 			const uint8_t *y_ptr1=Y+ypos*Y_stride,
    423 				*y_ptr2=Y+(ypos+1)*Y_stride,
    424 				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
    425 				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
    426 			
    427 			uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
    428 				*rgb_ptr2=RGB+(ypos+1)*RGB_stride;
    429 			
    430 			for(xpos=0; xpos<(width-31); xpos+=32)
    431 			{
    432 				YUV2RGB_32
    433 				{
    434 					PACK_PIXEL
    435 					SAVE_LINE1
    436 					if (uv_y_sample_interval > 1)
    437 					{
    438 						SAVE_LINE2
    439 					}
    440 				}
    441 
    442 				y_ptr1+=32*y_pixel_stride;
    443 				y_ptr2+=32*y_pixel_stride;
    444 				u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
    445 				v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
    446 				rgb_ptr1+=32*rgb_pixel_stride;
    447 				rgb_ptr2+=32*rgb_pixel_stride;
    448 			}
    449 		}
    450 
    451 		/* Catch the last line, if needed */
    452 		if (uv_y_sample_interval == 2 && ypos == (height-1))
    453 		{
    454 			const uint8_t *y_ptr=Y+ypos*Y_stride,
    455 				*u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
    456 				*v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
    457 			
    458 			uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
    459 
    460 			STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
    461 		}
    462 	}
    463 
    464 	/* Catch the right column, if needed */
    465 	{
    466 		int converted = (width & ~31);
    467 		if (converted != width)
    468 		{
    469 			const uint8_t *y_ptr=Y+converted*y_pixel_stride,
    470 				*u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
    471 				*v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
    472 			
    473 			uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
    474 
    475 			STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
    476 		}
    477 	}
    478 }
    479 
    480 #undef SSE_FUNCTION_NAME
    481 #undef STD_FUNCTION_NAME
    482 #undef YUV_FORMAT
    483 #undef RGB_FORMAT
    484 #undef SSE_ALIGNED
    485 #undef LOAD_SI128
    486 #undef SAVE_SI128
    487 #undef UV2RGB_16
    488 #undef ADD_Y2RGB_16
    489 #undef PACK_RGB24_32_STEP1
    490 #undef PACK_RGB24_32_STEP2
    491 #undef PACK_RGB24_32
    492 #undef PACK_RGBA_32
    493 #undef PACK_PIXEL
    494 #undef SAVE_LINE1
    495 #undef SAVE_LINE2
    496 #undef READ_Y
    497 #undef READ_UV
    498 #undef YUV2RGB_32