|
void YV12toYUY2_SSE2_tmp(BYTE *curY, BYTE *curU, BYTE *curV,<br />
BYTE *pDst, BYTE *pTmp, int XSize, int YSize,<br />
int srcPitch, int dstPitch /* bytes wide for YUY2 surface */) {<br />
int row, col;<br />
int XSize_2 = XSize >> 1;<br />
int srcPitch_2 = srcPitch >> 1;<br /><br />
__m128i vzero;<br />
__m128i vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6;<br /><br />
vzero = _mm_setzero_si128();<br /><br />
for (row=0; row < YSize; row += 2) {<br />
// watch for buffer size issues<br />
for (col=0; col < XSize_2; col += 16) {<br />
// Load 16 Y's, row 0<br />
vtmp0 = _mm_load_si128((__m128i*)(curY+2*col));<br />
vtmp1 = _mm_loadl_epi64((__m128i*)(curU+col));<br />
// Load 8 U's<br />
vtmp2 = _mm_loadl_epi64((__m128i*)(curV+col));<br />
// Load 8 V's<br />
vtmp6 = _mm_load_si128((__m128i*)(curY+2*col+srcPitch));<br />
// Load 16 Y's, row 1<br /><br />
vtmp3 = vtmp0;<br />
vtmp0 = _mm_unpacklo_epi8(vtmp0, vzero);<br />
// __Y7__Y6__Y5__Y4__Y3__Y2__Y1__Y0<br />
vtmp1 = _mm_unpacklo_epi8(vtmp1, vzero);<br />
// __U7__U6__U5__U4__U3__U2__U1__U0<br />
vtmp2 = _mm_unpacklo_epi8(vtmp2, vzero);<br />
// __V7__V6__V5__V4__V3__V2__V1__V0<br />
vtmp3 = _mm_unpackhi_epi8(vtmp3, vzero);<br />
// __YF__YE__YD__YC__YB__YA__Y9__Y8<br /><br />
vtmp4 = vtmp1;<br />
vtmp1 = _mm_unpacklo_epi16(vtmp1, vzero);<br />
// ______U3______U2______U1______U0<br />
vtmp5 = vtmp2;<br />
vtmp2 = _mm_unpacklo_epi16(vtmp2, vzero);<br />
// ______V3______V2______V1______V0<br /><br />
vtmp4 = _mm_unpackhi_epi16(vtmp4, vzero);<br />
// ______U7______U6______U5______U4<br />
vtmp5 = _mm_unpackhi_epi16(vtmp5, vzero);<br />
// ______V7______V6______V5______V4<br /><br />
vtmp1 = _mm_slli_epi32(vtmp1, 8);<br />
// ____U3______U2______U1______U0__<br />
vtmp2 = _mm_slli_epi32(vtmp2, 24);<br />
// V3______V2______V1______V0______<br />
vtmp4 = _mm_slli_epi32(vtmp4, 8);<br />
// ____U7______U6______U5______U4__<br />
vtmp5 = _mm_slli_epi32(vtmp5, 24);<br />
// V7______V6______V5______V4______<br /><br />
// All 8 xmm regs used<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp1);<br />
// __Y7U3Y6__Y5U2Y4__Y3U1Y2__Y1U0Y0<br />
vtmp3 = _mm_or_si128(vtmp3, vtmp4);<br />
// __YFU7YE__YDU6YC__YBU5YA__Y9U4Y8<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp2);<br />
// V3Y7U3Y6V2Y5U2Y4V1Y3U1Y2V0Y1U0Y0<br />
vtmp3 = _mm_or_si128(vtmp3, vtmp5);<br />
// V7YFU7YEV6YDU6YCV5YBU5YAV4Y9U4Y8<br /><br />
_mm_stream_si128((__m128i*)(pDst+4*col), vtmp0);<br />
// store first 8 pixels of row 0<br />
vtmp0 = vtmp6;<br />
_mm_stream_si128((__m128i*)(pDst+4*col+16), vtmp3);<br />
// store second 8 pixels of row 0<br /><br />
vtmp6 = _mm_unpacklo_epi8(vtmp6, vzero);<br />
// __Y7__Y6__Y5__Y4__Y3__Y2__Y1__Y0, row 1<br />
vtmp0 = _mm_unpackhi_epi8(vtmp0, vzero);<br />
// __YF__YE__YD__YC__YB__YA__Y9__Y8, row 1<br /><br />
vtmp6 = _mm_or_si128(vtmp6, vtmp1);<br />
// __Y7U3Y6__Y5U2Y4__Y3U1Y2__Y1U0Y0<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp4);<br />
// __YFU7YE__YDU6YC__YBU5YA__Y9U4Y8<br />
vtmp6 = _mm_or_si128(vtmp6, vtmp2);<br />
// V3Y7U3Y6V2Y5U2Y4V1Y3U1Y2V0Y1U0Y0<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp5);<br />
// V7YFU7YEV6YDU6YCV5YBU5YAV4Y9U4Y8<br /><br />
// store first 8 pixels of row 1<br />
_mm_store_si128((__m128i*)(pTmp+4*col), vtmp6);<br />
// store second 8 pixels of row 1<br />
_mm_store_si128((__m128i*)(pTmp+4*col+16), vtmp0);<br /><br />
// ------------ Second set ---------------<br />
vtmp0 = _mm_load_si128((__m128i*)(curY+2*col+16));<br />
// Load 16 Y's, row 0<br />
vtmp1 = _mm_loadl_epi64((__m128i*)(curU+col+8));<br />
// Load 8 U's<br />
vtmp2 = _mm_loadl_epi64((__m128i*)(curV+col+8));<br />
// Load 8 V's<br />
vtmp6 = _mm_load_si128((__m128i*)(curY+2*col+srcPitch+16));<br />
// Load 16 Y's, row 1<br /><br />
vtmp3 = vtmp0;<br />
vtmp0 = _mm_unpacklo_epi8(vtmp0, vzero);<br />
// __Y7__Y6__Y5__Y4__Y3__Y2__Y1__Y0<br />
vtmp1 = _mm_unpacklo_epi8(vtmp1, vzero);<br />
// __U7__U6__U5__U4__U3__U2__U1__U0<br />
vtmp2 = _mm_unpacklo_epi8(vtmp2, vzero);<br />
// __V7__V6__V5__V4__V3__V2__V1__V0<br />
vtmp3 = _mm_unpackhi_epi8(vtmp3, vzero);<br />
// __YF__YE__YD__YC__YB__YA__Y9__Y8<br /><br />
vtmp4 = vtmp1;<br />
vtmp1 = _mm_unpacklo_epi16(vtmp1, vzero);<br />
// ______U3______U2______U1______U0<br />
vtmp5 = vtmp2;<br />
vtmp2 = _mm_unpacklo_epi16(vtmp2, vzero);<br />
// ______V3______V2______V1______V0<br /><br />
vtmp4 = _mm_unpackhi_epi16(vtmp4, vzero);<br />
// ______U7______U6______U5______U4<br />
vtmp5 = _mm_unpackhi_epi16(vtmp5, vzero);<br />
// ______V7______V6______V5______V4<br /><br />
vtmp1 = _mm_slli_epi32(vtmp1, 8);<br />
// ____U3______U2______U1______U0__<br />
vtmp2 = _mm_slli_epi32(vtmp2, 24);<br />
// V3______V2______V1______V0______<br />
vtmp4 = _mm_slli_epi32(vtmp4, 8);<br />
// ____U7______U6______U5______U4__<br />
vtmp5 = _mm_slli_epi32(vtmp5, 24);<br />
// V7______V6______V5______V4______<br /><br />
// All 8 xmm regs used<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp1);<br />
// __Y7U3Y6__Y5U2Y4__Y3U1Y2__Y1U0Y0<br />
vtmp3 = _mm_or_si128(vtmp3, vtmp4);<br />
// __YFU7YE__YDU6YC__YBU5YA__Y9U4Y8<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp2);<br />
// V3Y7U3Y6V2Y5U2Y4V1Y3U1Y2V0Y1U0Y0<br />
vtmp3 = _mm_or_si128(vtmp3, vtmp5);<br />
// V7YFU7YEV6YDU6YCV5YBU5YAV4Y9U4Y8<br /><br />
_mm_stream_si128((__m128i*)(pDst+4*col+32), vtmp0);<br />
// store first 8 pixels of row 0<br />
vtmp0 = vtmp6;<br />
_mm_stream_si128((__m128i*)(pDst+4*col+48), vtmp3);<br />
// store second 8 pixels of row 0<br /><br />
vtmp6 = _mm_unpacklo_epi8(vtmp6, vzero);<br />
// __Y7__Y6__Y5__Y4__Y3__Y2__Y1__Y0, row 1<br />
vtmp0 = _mm_unpackhi_epi8(vtmp0, vzero);<br />
// __YF__YE__YD__YC__YB__YA__Y9__Y8, row 1<br /><br />
vtmp6 = _mm_or_si128(vtmp6, vtmp1);<br />
// __Y7U3Y6__Y5U2Y4__Y3U1Y2__Y1U0Y0<br />
vtmp0 = _mm_or_si128(vtmp0, vtmp4);<br /> |