|
|
|
|
@@ -9,6 +9,12 @@
|
|
|
|
|
|
|
|
|
|
/* Minimal SSE2 -> WebAssembly SIMD128 compatibility shim.
 * Maps the subset of SSE2 intrinsics used below onto wasm_simd128.h
 * equivalents (the wasm header is assumed to be included already). */
typedef __i64x2 __m128i;

#define _mm_load_si128 wasm_v128_load

#define _mm_add_epi8 wasm_i8x16_add

#define _mm_set1_epi8 wasm_i8x16_splat

/* wasm_i16x8_shr operates on the signed i16x8 interpretation, i.e. an
 * arithmetic shift, which matches _mm_srai_epi16. */
#define _mm_srai_epi16 wasm_i16x8_shr

#define _mm_mullo_epi16 wasm_i16x8_mul

#define _mm_sub_epi8 wasm_i8x16_sub

/* FIX: wasm_i64x2_const() takes two constant arguments, so a bare alias
 * (#define _mm_setzero_si128 wasm_i64x2_const) fails to compile wherever
 * the code calls _mm_setzero_si128().  Define it as a zero-argument
 * function-like macro expanding to an all-zero vector instead. */
#define _mm_setzero_si128() wasm_i64x2_const(0, 0)
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
@@ -433,493 +439,6 @@ void nv21_rgb24_std(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef __SSE2__
|
|
|
|
|
|
|
|
|
|
//see rgb.txt
|
|
|
|
|
/* One interleave step of the RGB24 de-interleaving network (see rgb.txt).
 * Interleaves the bytes of RS1..RS3 with RS4..RS6 into RD1..RD6; applied
 * four times in sequence (ping-ponging between two register sets) it
 * separates 32 packed RGB triplets per row into near-planar R, G and B
 * vectors.  Order of the six assignments is significant: sources and
 * destinations must not alias within one step. */
#define UNPACK_RGB24_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RD1, RD2, RD3, RD4, RD5, RD6) \
    RD1 = _mm_unpacklo_epi8(RS1, RS4); \
    RD2 = _mm_unpackhi_epi8(RS1, RS4); \
    RD3 = _mm_unpacklo_epi8(RS2, RS5); \
    RD4 = _mm_unpackhi_epi8(RS2, RS5); \
    RD5 = _mm_unpacklo_epi8(RS3, RS6); \
    RD6 = _mm_unpackhi_epi8(RS3, RS6);
|
|
|
|
|
|
|
|
|
|
/* Convert eight 16-bit R, G, B values to Y, U (Cb), V (Cr).
 * Uses the fixed-point coefficients of the `param` table in scope at the
 * expansion site (r/g/b_factor are 8-bit scaled, hence the >>8; y_factor
 * is 7-bit scaled, hence the >>7 with y_offset added last).
 * Note Y is computed first, then reused for U and V, and only rescaled
 * to its final range at the end — so the statement order matters. */
#define RGB2YUV_16(R, G, B, Y, U, V) \
    Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(G, _mm_set1_epi16(param->g_factor))); \
    Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->b_factor))); \
    Y = _mm_srli_epi16(Y, 8); \
    U = _mm_mullo_epi16(_mm_sub_epi16(B, Y), _mm_set1_epi16(param->cb_factor)); \
    U = _mm_add_epi16(_mm_srai_epi16(U, 8), _mm_set1_epi16(128)); \
    V = _mm_mullo_epi16(_mm_sub_epi16(R, Y), _mm_set1_epi16(param->cr_factor)); \
    V = _mm_add_epi16(_mm_srai_epi16(V, 8), _mm_set1_epi16(128)); \
    Y = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(Y, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset));
|
|
|
|
|
|
|
|
|
|
/* Convert one 32x2-pixel tile of packed RGB24 to YUV420.
 * Expansion-site contract (all names captured from the enclosing scope):
 *   reads  : rgb_ptr1 / rgb_ptr2  (two source rows, 96 bytes each used)
 *   writes : y_ptr1 / y_ptr2 (32 bytes each), u_ptr / v_ptr (16 bytes each)
 *   uses   : param (RGB2YUVParam*), LOAD_SI128 / SAVE_SI128 macros.
 * Chroma is averaged over each 2x2 pixel block (the /4 is the >>2 in the
 * final Cb/Cr rescale).  Declares its own locals, so it must be expanded
 * at the top of a block, once per scope. */
#define RGB2YUV_32 \
    __m128i r_16, g_16, b_16; \
    __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \
    __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; \
    __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
        rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
        rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
        rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
        rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
        rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
    /* unpack rgb24 data to r, g and b data in separate channels*/ \
    /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \
    /* here, because averaging in horizontal direction is easier like this*/ \
    /* The last step is applied further on the Y channel only*/ \
    UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \
    UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \
    UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \
    UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \
    /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \
    r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb1_16 = _mm_sub_epi16(b_16, y1_16); \
    cr1_16 = _mm_sub_epi16(r_16, y1_16); \
    r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr1), Y); \
    /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \
    r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \
    r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr2), Y); \
    /* Rescale Cb and Cr to their final range */ \
    cb1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \
    cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \
    \
    /* do the same again with next data */ \
    rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \
    rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \
    rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \
    rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)), \
    rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \
    rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
    /* unpack rgb24 data to r, g and b data in separate channels*/ \
    /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \
    /* here, because averaging in horizontal direction is easier like this*/ \
    /* The last step is applied further on the Y channel only*/ \
    UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \
    UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \
    UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \
    UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \
    /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \
    r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb2_16 = _mm_sub_epi16(b_16, y1_16); \
    cr2_16 = _mm_sub_epi16(r_16, y1_16); \
    r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr1+16), Y); \
    /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \
    r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \
    r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr2+16), Y); \
    /* Rescale Cb and Cr to their final range */ \
    cb2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \
    cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \
    /* Pack and save Cb Cr */ \
    cb = _mm_packus_epi16(cb1_16, cb2_16); \
    cr = _mm_packus_epi16(cr1_16, cr2_16); \
    SAVE_SI128((__m128i*)(u_ptr), cb); \
    SAVE_SI128((__m128i*)(v_ptr), cr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void rgb24_yuv420_sse(uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_load_si128
|
|
|
|
|
#define SAVE_SI128 _mm_stream_si128
|
|
|
|
|
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*u_ptr=U+(y/2)*UV_stride,
|
|
|
|
|
*v_ptr=V+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
RGB2YUV_32
|
|
|
|
|
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
u_ptr+=16;
|
|
|
|
|
v_ptr+=16;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_loadu_si128
|
|
|
|
|
#define SAVE_SI128 _mm_storeu_si128
|
|
|
|
|
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*u_ptr=U+(y/2)*UV_stride,
|
|
|
|
|
*v_ptr=V+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
RGB2YUV_32
|
|
|
|
|
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
u_ptr+=16;
|
|
|
|
|
v_ptr+=16;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// see rgba.txt
/* One interleave step of the RGBA32 de-interleaving network: interleaves
 * the bytes of RS1..RS4 with RS5..RS8 into RD1..RD8.  Applied four times
 * (ping-ponging register sets) it separates 32 packed RGBA quadruplets
 * per row into near-planar R, G, B (and A) vectors. */
#define UNPACK_RGB32_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RS7, RS8, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8) \
    RD1 = _mm_unpacklo_epi8(RS1, RS5); \
    RD2 = _mm_unpackhi_epi8(RS1, RS5); \
    RD3 = _mm_unpacklo_epi8(RS2, RS6); \
    RD4 = _mm_unpackhi_epi8(RS2, RS6); \
    RD5 = _mm_unpacklo_epi8(RS3, RS7); \
    RD6 = _mm_unpackhi_epi8(RS3, RS7); \
    RD7 = _mm_unpacklo_epi8(RS4, RS8); \
    RD8 = _mm_unpackhi_epi8(RS4, RS8);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Convert one 32x2-pixel tile of packed RGBA32 to YUV420; the alpha
 * channel (deinterleaved into rgb4/rgb8) is ignored by the math.
 * Expansion-site contract (names captured from the enclosing scope):
 *   reads  : rgb_ptr1 / rgb_ptr2  (two source rows, 128 bytes each)
 *   writes : y_ptr1 / y_ptr2 (32 bytes each), u_ptr / v_ptr (16 bytes each)
 *   uses   : param (RGB2YUVParam*), LOAD_SI128 / SAVE_SI128 macros.
 * Declares its own locals, so expand at the top of a block, once per
 * scope. */
#define RGBA2YUV_32 \
    __m128i r_16, g_16, b_16; \
    __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \
    __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; \
    __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
        rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
        rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
        rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \
        rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
        rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
        rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)), \
        rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
    /* unpack rgb24 data to r, g and b data in separate channels*/ \
    /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \
    /* here, because averaging in horizontal direction is easier like this*/ \
    /* The last step is applied further on the Y channel only*/ \
    UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \
    UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \
    UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \
    UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \
    /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \
    r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb1_16 = _mm_sub_epi16(b_16, y1_16); \
    cr1_16 = _mm_sub_epi16(r_16, y1_16); \
    r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr1), Y); \
    /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \
    r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \
    r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \
    cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr2), Y); \
    /* Rescale Cb and Cr to their final range */ \
    cb1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \
    cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \
    \
    /* do the same again with next data */ \
    rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \
    rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \
    rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+96)), \
    rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+112)), \
    rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \
    rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)), \
    rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+96)), \
    rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+112)); \
    /* unpack rgb24 data to r, g and b data in separate channels*/ \
    /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \
    /* here, because averaging in horizontal direction is easier like this*/ \
    /* The last step is applied further on the Y channel only*/ \
    UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \
    UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \
    UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \
    UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \
    /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \
    r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb2_16 = _mm_sub_epi16(b_16, y1_16); \
    cr2_16 = _mm_sub_epi16(r_16, y1_16); \
    r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \
    g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \
    b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr1+16), Y); \
    /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \
    /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \
    r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \
    y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y1_16 = _mm_srli_epi16(y1_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \
    r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \
    g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \
    b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \
    y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \
        _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \
    y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \
    y2_16 = _mm_srli_epi16(y2_16, 8); \
    cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \
    cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \
    /* Rescale Y' to Y, pack it to 8bit values and save it */ \
    y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \
    Y = _mm_packus_epi16(y1_16, y2_16); \
    Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \
    SAVE_SI128((__m128i*)(y_ptr2+16), Y); \
    /* Rescale Cb and Cr to their final range */ \
    cb2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \
    cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \
    /* Pack and save Cb Cr */ \
    cb = _mm_packus_epi16(cb1_16, cb2_16); \
    cr = _mm_packus_epi16(cr1_16, cr2_16); \
    SAVE_SI128((__m128i*)(u_ptr), cb); \
    SAVE_SI128((__m128i*)(v_ptr), cr);
|
|
|
|
|
|
|
|
|
|
void rgb32_yuv420_sse(uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *RGBA, uint32_t RGBA_stride,
|
|
|
|
|
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_load_si128
|
|
|
|
|
#define SAVE_SI128 _mm_stream_si128
|
|
|
|
|
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride,
|
|
|
|
|
*rgb_ptr2=RGBA+(y+1)*RGBA_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*u_ptr=U+(y/2)*UV_stride,
|
|
|
|
|
*v_ptr=V+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
RGBA2YUV_32
|
|
|
|
|
|
|
|
|
|
rgb_ptr1+=128;
|
|
|
|
|
rgb_ptr2+=128;
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
u_ptr+=16;
|
|
|
|
|
v_ptr+=16;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void rgb32_yuv420_sseu(uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *RGBA, uint32_t RGBA_stride,
|
|
|
|
|
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_loadu_si128
|
|
|
|
|
#define SAVE_SI128 _mm_storeu_si128
|
|
|
|
|
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride,
|
|
|
|
|
*rgb_ptr2=RGBA+(y+1)*RGBA_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*u_ptr=U+(y/2)*UV_stride,
|
|
|
|
|
*v_ptr=V+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
RGBA2YUV_32
|
|
|
|
|
|
|
|
|
|
rgb_ptr1+=128;
|
|
|
|
|
rgb_ptr2+=128;
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
u_ptr+=16;
|
|
|
|
|
v_ptr+=16;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#ifdef __SSE2__
|
|
|
|
|
|
|
|
|
|
#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
|
|
|
|
|
@@ -965,22 +484,6 @@ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
|
|
|
|
|
__m128i u = LOAD_SI128((const __m128i*)(u_ptr)); \
|
|
|
|
|
__m128i v = LOAD_SI128((const __m128i*)(v_ptr)); \
|
|
|
|
|
|
|
|
|
|
/* Load 32 interleaved chroma bytes (U0 V0 U1 V1 ...) from uv_ptr and
 * deinterleave them into planar `u` and `v` vectors: the even (low)
 * bytes are U, the odd (high) bytes are V.  Declares uv1/uv2/u/v, so it
 * must be expanded where declarations are allowed, before YUV2RGB_32. */
#define LOAD_UV_NV12 \
    __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \
    __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \
    __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \
    uv1 = _mm_srli_epi16(uv1, 8); \
    uv2 = _mm_srli_epi16(uv2, 8); \
    __m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \
|
|
|
|
|
|
|
|
|
|
/* NV21 counterpart of LOAD_UV_NV12: the chroma plane is interleaved as
 * V0 U0 V1 U1 ..., so the even (low) bytes become `v` and the odd
 * (high) bytes become `u`.  Declares uv1/uv2/u/v in the current scope. */
#define LOAD_UV_NV21 \
    __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \
    __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \
    __m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \
    uv1 = _mm_srli_epi16(uv1, 8); \
    uv2 = _mm_srli_epi16(uv2, 8); \
    __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \
|
|
|
|
|
|
|
|
|
|
#define YUV2RGB_32 \
|
|
|
|
|
__m128i r_tmp, g_tmp, b_tmp; \
|
|
|
|
|
__m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
|
|
|
|
|
@@ -1080,15 +583,6 @@ PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
|
|
|
|
|
LOAD_UV_PLANAR \
|
|
|
|
|
YUV2RGB_32
|
|
|
|
|
|
|
|
|
|
/* One 32x2-pixel NV12 -> RGB step: deinterleave the shared UV plane
 * (LOAD_UV_NV12 declares u/v) and run the common YUV2RGB_32 math. */
#define YUV2RGB_32_NV12 \
    LOAD_UV_NV12 \
    YUV2RGB_32
|
|
|
|
|
|
|
|
|
|
/* One 32x2-pixel NV21 -> RGB step: deinterleave the shared VU plane
 * (LOAD_UV_NV21 declares u/v) and run the common YUV2RGB_32 math. */
#define YUV2RGB_32_NV21 \
    LOAD_UV_NV21 \
    YUV2RGB_32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void yuv420_rgb24_sse(
|
|
|
|
|
uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
@@ -1163,146 +657,4 @@ void yuv420_rgb24_sseu(
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void nv12_rgb24_sse(
|
|
|
|
|
uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_load_si128
|
|
|
|
|
#define SAVE_SI128 _mm_stream_si128
|
|
|
|
|
const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*uv_ptr=UV+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
YUV2RGB_32_NV12
|
|
|
|
|
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
uv_ptr+=32;
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void nv12_rgb24_sseu(
|
|
|
|
|
uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_loadu_si128
|
|
|
|
|
#define SAVE_SI128 _mm_storeu_si128
|
|
|
|
|
const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*uv_ptr=UV+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
YUV2RGB_32_NV12
|
|
|
|
|
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
uv_ptr+=32;
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void nv21_rgb24_sse(
|
|
|
|
|
uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_load_si128
|
|
|
|
|
#define SAVE_SI128 _mm_stream_si128
|
|
|
|
|
const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*uv_ptr=UV+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
YUV2RGB_32_NV21
|
|
|
|
|
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
uv_ptr+=32;
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void nv21_rgb24_sseu(
|
|
|
|
|
uint32_t width, uint32_t height,
|
|
|
|
|
const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
|
|
|
|
|
uint8_t *RGB, uint32_t RGB_stride,
|
|
|
|
|
YCbCrType yuv_type)
|
|
|
|
|
{
|
|
|
|
|
#define LOAD_SI128 _mm_loadu_si128
|
|
|
|
|
#define SAVE_SI128 _mm_storeu_si128
|
|
|
|
|
const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
|
|
|
|
|
|
|
|
|
|
uint32_t x, y;
|
|
|
|
|
for(y=0; y<(height-1); y+=2)
|
|
|
|
|
{
|
|
|
|
|
const uint8_t *y_ptr1=Y+y*Y_stride,
|
|
|
|
|
*y_ptr2=Y+(y+1)*Y_stride,
|
|
|
|
|
*uv_ptr=UV+(y/2)*UV_stride;
|
|
|
|
|
|
|
|
|
|
uint8_t *rgb_ptr1=RGB+y*RGB_stride,
|
|
|
|
|
*rgb_ptr2=RGB+(y+1)*RGB_stride;
|
|
|
|
|
|
|
|
|
|
for(x=0; x<(width-31); x+=32)
|
|
|
|
|
{
|
|
|
|
|
YUV2RGB_32_NV21
|
|
|
|
|
|
|
|
|
|
y_ptr1+=32;
|
|
|
|
|
y_ptr2+=32;
|
|
|
|
|
uv_ptr+=32;
|
|
|
|
|
rgb_ptr1+=96;
|
|
|
|
|
rgb_ptr2+=96;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#undef LOAD_SI128
|
|
|
|
|
#undef SAVE_SI128
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#endif //__SSE2__
|
|
|
|
|
|