From bc0da3a96daa973f8d9cdc7b0beec9ab74e823ac Mon Sep 17 00:00:00 2001
From: Dheeraj CVR
Date: Sat, 30 May 2015 23:08:59 +0400
Subject: [PATCH] exynos: multimedia: speed up color conversion from ARGB8888
 to YUV420SP

Use NEON instructions for color conversion from ARGB8888 to YUV420SP.
This greatly improves performance and can help achieve frame rates of
30fps and above when dealing with OMX_COLOR_FormatAndroidOpaque during
Screen Recording and Screen Casting.

Change-Id: Ifdaaf03e1ce6909822df3f046ef35dd977b84d17
---
 .../component/video/enc/SEC_OMX_Venc.c        |   2 +-
 .../multimedia/utils/csc/exynos4/Android.mk   |   1 +
 .../utils/csc/exynos4/color_space_convertor.h |   7 +
 .../exynos4/csc_ARGB8888_to_YUV420SP_NEON.s   | 365 ++++++++++++++++++
 4 files changed, 374 insertions(+), 1 deletion(-)
 create mode 100644 exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s

diff --git a/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c b/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
index d3f16b9a..e7385071 100644
--- a/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
+++ b/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
@@ -805,7 +805,7 @@ OMX_BOOL SEC_Preprocessor_InputData(OMX_COMPONENTTYPE *pOMXComponent)
         SEC_OSAL_GetInfoFromMetaData(inputData, ppBuf);
         SEC_OSAL_LockANBHandle((OMX_U32)ppBuf[0], width, height, OMX_COLOR_FormatAndroidOpaque, &pOutBuffer);
-        csc_ARGB8888_to_YUV420SP(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
+        csc_ARGB8888_to_YUV420SP_NEON(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
                                  pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].CVirAddr,
                                  pOutBuffer, width, height);
diff --git a/exynos/multimedia/utils/csc/exynos4/Android.mk b/exynos/multimedia/utils/csc/exynos4/Android.mk
index e7ed4e24..8609819b 100644
--- a/exynos/multimedia/utils/csc/exynos4/Android.mk
+++ b/exynos/multimedia/utils/csc/exynos4/Android.mk
@@ -15,6 +15,7 @@ LOCAL_SRC_FILES := \
 	csc_linear_to_tiled_interleave_crop_neon.s \
 	csc_tiled_to_linear_crop_neon.s \
 	csc_tiled_to_linear_deinterleave_crop_neon.s \
+	csc_ARGB8888_to_YUV420SP_NEON.s \
 	csc_interleave_memcpy_neon.s \
 	csc_fimc.cpp
diff --git a/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h b/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
index 92c0a6d9..1967f481 100644
--- a/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
+++ b/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
@@ -411,4 +411,11 @@ void csc_ARGB8888_to_YUV420SP(
     unsigned int width,
     unsigned int height);
 
+void csc_ARGB8888_to_YUV420SP_NEON(
+    unsigned char *y_dst,
+    unsigned char *uv_dst,
+    unsigned char *rgb_src,
+    unsigned int width,
+    unsigned int height);
+
 #endif /*COLOR_SPACE_CONVERTOR_H_*/
diff --git a/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s b/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s
new file mode 100644
index 00000000..62ccf97a
--- /dev/null
+++ b/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s
@@ -0,0 +1,365 @@
+
+    .arch armv7-a
+    .text
+    .global csc_ARGB8888_to_YUV420SP_NEON
+    .type   csc_ARGB8888_to_YUV420SP_NEON, %function
+csc_ARGB8888_to_YUV420SP_NEON:
+    .fnstart
+
+    @r0     pDstY
+    @r1     pDstUV
+    @r2     pSrcRGB
+    @r3     nWidth
+    @r4     pDstY2 = pDstY + nWidth
+    @r5     pSrcRGB2 = pSrcRGB + nWidthx4
+    @r6     temp7, mask/scratch
+    @r7     temp6, accumulator
+    @r8     temp5, nWidthTmp
+    @r9     temp4, raw ARGB8888 word
+    @r10    temp3, r,g,b
+    @r11    temp2, immediate operand
+    @r12    temp1, nHeight
+    @r14    temp0, debugging pointer
+
+    .equ CACHE_LINE_SIZE, 32
+    .equ PRE_LOAD_OFFSET, 6
+
+    stmfd sp!, {r4-r12,r14}     @ backup registers
+    ldr r12, [sp, #40]          @ load nHeight
+    @ldr r14, [sp, #44]         @ load pTest
+    add r4, r0, r3              @r4: pDstY2 = pDstY + nWidth
+    add r5, r2, r3, lsl #2      @r5: pSrcRGB2 = pSrcRGB + nWidthx4
+    sub r8, r3, #16             @r8: nWidthTmp = nWidth - 16
+
+    @q0: temp1, R
+    @q1: temp2, GB
+    @q2: R
+    @q3: G
+    @q4: B
+    @q5: temp3, output
+
+    vmov.u16 q6, #66            @ coefficient assignment
+    vmov.u16 q7, #129
+    vmov.u16 q8, #25
+    vmov.u16 q9, #0x8080        @ 128<<8 + 128
+
+    vmov.u16 q10, #0x1000       @ 16<<8 + 128, built in two steps because
+    vorr.u16 q10, #0x0080       @ NEON immediates cannot encode 0x1080
+
+    vmov.u16 q11, #38           @ used as -38
+    vmov.u16 q12, #74           @ used as -74
+    vmov.u16 q13, #112
+    vmov.u16 q14, #94           @ used as -94
+    vmov.u16 q15, #18           @ used as -18
+
+LOOP_NHEIGHT2:
+    stmfd sp!, {r12}            @ backup nHeight
+
+LOOP_NWIDTH16:
+    pld [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
+    @-------------------------------------------YUV ------------------------------------------
+    vmov.u16 q14, #94           @ reload -94 (clobbered by the Y pass below)
+    vmov.u16 q15, #18           @ reload -18
+    vld4.8 {d0,d1,d2,d3}, [r2]! @ load 8 ARGB8888 pixels, de-interleaved by byte
+    vld4.8 {d4,d5,d6,d7}, [r2]! @ load 8 more ARGB8888 pixels
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vand.u16 q4,#0x00FF         @R of even pixels
+    vand.u16 q5,#0x00FF         @G of even pixels
+    vand.u16 q6,#0x00FF         @B of even pixels
+
+    vmov.u16 q8,q9              @ CalcU()
+    vmla.u16 q8,q6,q13          @ +(112 * B[k])
+    vmls.u16 q8,q4,q11          @ -(38 * R[k])
+    vmls.u16 q8,q5,q12          @ -(74 * G[k])
+    vshr.u16 q8,q8, #8          @ (128<<8 + 128 + u) >> 8
+
+    vmov.u16 q7,q9              @ CalcV()
+    vmla.u16 q7,q4,q13          @ +(112 * R[k])
+    vmls.u16 q7,q5,q14          @ -(94 * G[k])
+    vmls.u16 q7,q6,q15          @ -(18 * B[k])
+    vshr.u16 q7,q7, #8          @ (128<<8 + 128 + v) >> 8
+
+    vtrn.8 q8,q7
+    vst1.8 {q8}, [r1]!          @ write interleaved UV to the UV plane
+
+    @-------------------------------------------Y ------------------------------------------
+
+    vmov.u16 q14, #66           @ Y coefficients
+    vmov.u16 q15, #129
+    vmov.u16 q8, #25
+
+    @CalcY_Y()
+
+    vmul.u16 q7,q4,q14          @ q7  = 66 * R[k]
+    vmla.u16 q7,q5,q15          @ q7 += 129 * G[k]
+    vmla.u16 q7,q6,q8           @ q7 += 25 * B[k]
+
+    vadd.u16 q7,q7,q10
+    vshr.u16 q7,q7, #8
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vshr.u16 q4,q4,#8           @R of odd pixels
+    vshr.u16 q5,q5,#8           @G of odd pixels
+    vshr.u16 q6,q6,#8           @B of odd pixels
+
+    vmul.u16 q0,q4,q14          @ q0  = 66 * R[k]
+    vmla.u16 q0,q5,q15          @ q0 += 129 * G[k]
+    vmla.u16 q0,q6,q8           @ q0 += 25 * B[k]
+    vadd.u16 q0,q0,q10
+    vshr.u16 q0,q0, #8
+
+    vtrn.8 q7,q0
+    vst1.8 {q7}, [r0]!          @ write Y (even/odd re-interleaved) to the Y plane
+
+    @-------------------------------------------Y ------------------------------------------
+
+    @---------------------------------------------Y1-------------------------------------------
+
+    pld [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
+    vld4.8 {d0,d1,d2,d3}, [r5]! @ load 8 ARGB8888 pixels of the second row
+    vld4.8 {d4,d5,d6,d7}, [r5]! @ load 8 more ARGB8888 pixels
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vand.u16 q4,#0x00FF         @R of even pixels
+    vand.u16 q5,#0x00FF         @G of even pixels
+    vand.u16 q6,#0x00FF         @B of even pixels
+
+    vmul.u16 q7,q4,q14          @ q7  = 66 * R[k]
+    vmla.u16 q7,q5,q15          @ q7 += 129 * G[k]
+    vmla.u16 q7,q6,q8           @ q7 += 25 * B[k]
+    vadd.u16 q7,q7,q10
+    vshr.u16 q7,q7, #8
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vshr.u16 q4,q4,#8           @R of odd pixels
+    vshr.u16 q5,q5,#8           @G of odd pixels
+    vshr.u16 q6,q6,#8           @B of odd pixels
+
+    vmul.u16 q0,q4,q14          @ q0  = 66 * R[k]
+    vmla.u16 q0,q5,q15          @ q0 += 129 * G[k]
+    vmla.u16 q0,q6,q8           @ q0 += 25 * B[k]
+    vadd.u16 q0,q0,q10
+    vshr.u16 q0,q0, #8
+
+    vtrn.8 q7,q0
+    vst1.8 {q7}, [r4]!          @ write Y of the second row
+
+    subs r8,r8,#16              @ nWidth16 -= 16
+    BPL LOOP_NWIDTH16           @ loop while nWidth16 >= 0
+    @-----------------------------------unaligned---------------------------------------
+
+    adds r8,r8,#16              @ restore the remaining width (0..15)
+    BEQ NO_UNALIGNED            @ nWidth is a multiple of 16
+LOOP_NWIDTH2:
+    @----------------------------------pDstRGB1--Y------------------------------------------
+    @stmfd sp!, {r14}           @ backup r14
+
+    ldr r9, [r2], #4            @ load ARGB8888 word, pixel 0
+    ldr r12, [r2], #4           @ load ARGB8888 word, pixel 1
+
+    mov r10, r9,lsr #16         @ R0 into bits [7:0]
+    mov r14, r12                @ R1 stays in bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @R0: (rgbIn[k] >> 16) & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @R1 in the upper halfword
+    add r10,r10,r14             @ pack R0 | R1<<16
+
+    mov r11, #66                @ accumulator += R * 66
+    mul r7, r10, r11
+
+    mov r10, r9,lsr #8          @ G0 into bits [7:0]
+    mov r14, r12,lsl #8         @ G1 into bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @G0: (rgbIn[k] >> 8) & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @G1 in the upper halfword
+    add r10,r10,r14
+
+    mov r11, #129               @ accumulator += G * 129
+    mla r7, r10, r11, r7
+
+    mov r10, r9                 @ B0 in bits [7:0]
+    mov r14, r12,lsl #16        @ B1 into bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @B0: rgbIn[k] & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @B1 in the upper halfword
+    add r10,r10,r14
+
+    mov r11, #25                @ accumulator += B * 25
+    mla r7, r10, r11, r7
+
+    ldr r6, =0x10801080         @ add (16<<8 + 128) to both packed Y values
+    add r7, r6
+
+    lsr r7, #8                  @ Y of pixel 0
+    strb r7, [r0],#1
+    lsr r7,#16                  @ Y of pixel 1
+    strb r7, [r0],#1
+    @ldmfd sp!, {r14}           @ restore r14
+
+    @----------------------------------pDstRGB2--UV------------------------------------------
+
+    mov r10, r9                 @ pixel 0 again for the chroma pair
+    ldr r7,=0x00008080          @ U accumulator = 128<<8 + 128
+    mov r12,r7                  @ V accumulator = 128<<8 + 128
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @B: rgbIn[k] & 0xFF
+
+    mov r11, #112               @ U += B * 112
+    mla r7, r10, r11, r7
+
+    mov r11, #18                @ V -= B * 18
+    mul r11, r10, r11
+    sub r12, r12, r11
+
+    mov r10, r9, lsr #16
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @R: (rgbIn[k] >> 16) & 0xFF
+
+    mov r11, #38                @ U -= R * 38
+    mul r11, r10, r11
+    sub r7, r7, r11
+
+    mov r11, #112               @ V += R * 112
+    mla r12, r10, r11, r12
+
+    mov r10, r9,lsr #8
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @G: (rgbIn[k] >> 8) & 0xFF
+
+    mov r11, #74                @ U -= G * 74
+    mul r11, r10, r11
+    sub r7, r7, r11
+
+    mov r11, #94                @ V -= G * 94
+    mul r11, r10, r11
+    sub r12, r12, r11
+
+    lsr r7, #8                  @ U >> 8
+    strb r7, [r1],#1
+    lsr r12, #8                 @ V >> 8
+    strb r12, [r1],#1
+
+    @----------------------------------pDstRGB2--Y------------------------------------------
+    @stmfd sp!, {r14}           @ backup r14
+
+    ldr r9, [r5], #4            @ load ARGB8888 word, pixel 0 of second row
+    ldr r12, [r5], #4           @ load ARGB8888 word, pixel 1 of second row
+
+    mov r10, r9,lsr #16         @ R0 into bits [7:0]
+    mov r14, r12                @ R1 stays in bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @R0: (rgbIn[k] >> 16) & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @R1 in the upper halfword
+    add r10,r10,r14
+
+    mov r11, #66                @ accumulator += R * 66
+    mul r7, r10, r11
+
+    mov r10, r9,lsr #8          @ G0 into bits [7:0]
+    mov r14, r12,lsl #8         @ G1 into bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @G0: (rgbIn[k] >> 8) & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @G1 in the upper halfword
+    add r10,r10,r14
+
+    mov r11, #129               @ accumulator += G * 129
+    mla r7, r10, r11, r7
+
+    mov r10, r9                 @ B0 in bits [7:0]
+    mov r14, r12,lsl #16        @ B1 into bits [23:16]
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6            @B0: rgbIn[k] & 0xFF
+    ldr r6, =0x00FF0000
+    and r14, r14, r6            @B1 in the upper halfword
+    add r10,r10,r14
+
+    mov r11, #25                @ accumulator += B * 25
+    mla r7, r10, r11, r7
+
+    ldr r6, =0x10801080         @ add (16<<8 + 128) to both packed Y values
+    add r7, r6
+    lsr r7, #8                  @ Y of pixel 0
+
+    strb r7, [r4],#1
+    lsr r7,#16                  @ Y of pixel 1
+    strb r7, [r4],#1
+    @ldmfd sp!, {r14}           @ restore r14
+
+    subs r8,r8,#2               @ nWidth2 -= 2
+    BGT LOOP_NWIDTH2            @ loop while nWidth2 > 0
+
+NO_UNALIGNED:                   @ nWidth is a multiple of 16
+
+    @-----------------------------------------------------------------------------
+    sub r8, r3, #16             @r8: nWidthTmp = nWidth - 16
+    add r0, r0, r3              @ pDstY    += nWidth (skip the row just written as pDstY2)
+    add r2, r2, r3, lsl #2      @ pSrcRGB  += nWidthx4
+    add r4, r4, r3              @ pDstY2   += nWidth
+    add r5, r5, r3, lsl #2      @ pSrcRGB2 += nWidthx4
+
+    ldmfd sp!, {r12}            @ restore nHeight
+    subs r12,r12,#2             @ nHeight -= 2
+    BGT LOOP_NHEIGHT2           @ loop while nHeight > 0
+
+    ldmfd sp!, {r4-r12,pc}      @ restore registers and return
+    .fnend
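
--
For reference, below is a minimal scalar C sketch of the per-pixel math the
NEON routine implements, using the same fixed-point BT.601 coefficients that
appear in the assembly (66/129/25 for Y, -38/-74/112 for U, 112/-94/-18 for V,
with the 0x1080 and 0x8080 rounding offsets). The function and variable names
are illustrative only and are not part of the patch; like the assembly, it
takes U and V from the top-left pixel of each 2x2 block and stores them
interleaved (NV12 layout). It can be used to cross-check the NEON output.

    #include <stdint.h>

    /* Scalar reference:
     *   Y = ((66R + 129G +  25B + 128) >> 8) + 16
     *   U = ((-38R - 74G + 112B + 128) >> 8) + 128
     *   V = ((112R - 94G -  18B + 128) >> 8) + 128
     * width and height are assumed even, as the assembly requires. */
    static void argb8888_to_yuv420sp_ref(uint8_t *y_dst, uint8_t *uv_dst,
                                         const uint32_t *rgb_src,
                                         unsigned int width, unsigned int height)
    {
        for (unsigned int row = 0; row < height; row++) {
            for (unsigned int col = 0; col < width; col++) {
                uint32_t p = rgb_src[row * width + col];
                int r = (p >> 16) & 0xFF;
                int g = (p >> 8)  & 0xFF;
                int b = p & 0xFF;

                y_dst[row * width + col] =
                    (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);

                /* One interleaved U,V pair per 2x2 block, sampled from the
                 * block's top-left pixel, exactly as the assembly does. */
                if ((row & 1) == 0 && (col & 1) == 0) {
                    unsigned int i = (row / 2) * width + col;
                    uv_dst[i]     = (uint8_t)(((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128);
                    uv_dst[i + 1] = (uint8_t)(((112 * r - 94 * g - 18 * b + 128) >> 8) + 128);
                }
            }
        }
    }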