From 40a2184b1b3f34179fd50138105daede56a62f7f Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sat, 9 Apr 2011 01:15:28 +0800
Subject: [PATCH] libpixelflinger: Add ARM NEON optimized scanline_t32cb16

Reference benchmark results on Beagleboard (TI OMAP353x) at 500 MHz:

    scanline_t32cb16_c memory bandwidth: 31.63 MB/s
    scanline_t32cb16_neon memory bandwidth: 147.69 MB/s

It can dramatically improve the performance of boot animation.

Change-Id: Iff6a0fea330fc82d1570909dd57155913ef056ab
---
 libpixelflinger/Android.mk     |    1 +
 libpixelflinger/scanline.cpp   |    6 ++
 libpixelflinger/t32cb16_neon.S |  200 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 207 insertions(+), 0 deletions(-)
 create mode 100644 libpixelflinger/t32cb16_neon.S

diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk
index 1e93499..bbcfb9d 100644
--- a/libpixelflinger/Android.mk
+++ b/libpixelflinger/Android.mk
@@ -30,6 +30,7 @@ PIXELFLINGER_SRC_FILES:= \
 
 ifeq ($(TARGET_ARCH),arm)
 ifeq ($(TARGET_ARCH_VARIANT),armv7-a-neon)
+PIXELFLINGER_SRC_FILES += t32cb16_neon.S
 PIXELFLINGER_SRC_FILES += t32cb16blend.S
 PIXELFLINGER_SRC_FILES += col32cb16blend_neon.S
 else
diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp
index 72acaaf..10c6b78 100644
--- a/libpixelflinger/scanline.cpp
+++ b/libpixelflinger/scanline.cpp
@@ -93,6 +93,7 @@ static void scanline_clear(context_t* c);
 static void rect_generic(context_t* c, size_t yc);
 static void rect_memcpy(context_t* c, size_t yc);
 
+extern "C" void scanline_t32cb16_neon(uint16_t *dst, uint32_t *src, size_t ct);
 extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
 extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
 extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
@@ -1320,6 +1321,10 @@ void scanline_t32cb16(context_t* c)
     int sR, sG, sB;
     uint32_t s, d;
 
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__)) && \
+    defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+    scanline_t32cb16_neon(dst, src, ct);
+#else
     if (ct==1 || uint32_t(dst)&2) {
 last_one:
         s = GGL_RGBA_TO_HOST( *src++ );
@@ -1354,6 +1359,7 @@ last_one:
     if (ct > 0) {
         goto last_one;
     }
+#endif
 }
 
 void scanline_t32cb16blend(context_t* c)
diff --git a/libpixelflinger/t32cb16_neon.S b/libpixelflinger/t32cb16_neon.S
new file mode 100644
index 0000000..9d72adf
--- /dev/null
+++ b/libpixelflinger/t32cb16_neon.S
@@ -0,0 +1,200 @@
+/* libs/pixelflinger/t32cb16_neon.S
+ *
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    .text
+    .align
+
+    .global scanline_t32cb16_neon
+    .type   scanline_t32cb16_neon, %function
+
+// void scanline_t32cb16_neon(uint16_t *dst, uint32_t *src, size_t ct)
+//
+// Blends ct 32-bit source pixels onto ct 16-bit (565) destination pixels,
+// eight pixels per NEON pass.  For every pixel, the fourth source byte in
+// memory is the alpha value; each destination channel is scaled by
+// (255 - alpha) and the matching source byte is added with saturation:
+//   source byte 0 -> destination bits 15:11
+//   source byte 1 -> destination bits 10:5
+//   source byte 2 -> destination bits  4:0
+//
+// r0: dst ptr
+// r1: src ptr
+// r2: count
+// r3: store pointer (trails r0 so results can be written after the loads)
+// ip: size of the first, possibly partial, block when count >= 8
+//
+// Only q0-q3 and q8-q15 are touched (caller-saved under the AAPCS).
+
+scanline_t32cb16_neon:
+    cmp         r2, #7
+    bhi         .Lcount_ge_8
+
+    // handle count < 8: bits 2/1/0 of count select 4/2/1-pixel partial
+    // loads into disjoint lanes; lanes that are not loaded are never stored
+    vmov.u8     d31, #1<<7
+    mov         r3, r0
+
+    tst         r2, #4
+    beq         14f
+    vld1.16     {d25}, [r0]!
+    vld1.32     {q1}, [r1]!
+
+14:
+    tst         r2, #2
+    beq         12f
+    vld1.32     {d24[1]}, [r0]!
+    vld1.32     {d1}, [r1]!
+
+12:
+    tst         r2, #1
+    beq         11f
+    vld1.16     {d24[1]}, [r0]!
+    vld1.32     {d0[1]}, [r1]!
+
+11:
+    // unzip achieves the same as a vld4 operation
+    vuzpq.u16   q0, q1
+    vuzp.u8     d0, d1
+    vuzp.u8     d2, d3
+    // expand 0565 q12 to 8888 {d4-d6}
+    vmovn.u16   d4, q12
+    vshr.u16    q11, q12, #5
+    vshr.u16    q10, q12, #6+5
+    vmovn.u16   d5, q11
+    vmovn.u16   d6, q10
+    vshl.u8     d4, d4, #3
+    vshl.u8     d5, d5, #2
+    vshl.u8     d6, d6, #3
+
+    // seed each accumulator with 128 (d31) for rounding
+    vmovl.u8    q14, d31
+    vmovl.u8    q13, d31
+    vmovl.u8    q12, d31
+
+    // dst * (255 - alpha), then narrow back to 8 bits per channel
+    vmvn.8      d30, d3
+    vmlal.u8    q14, d30, d6
+    vmlal.u8    q13, d30, d5
+    vmlal.u8    q12, d30, d4
+    vshr.u16    q8, q14, #5
+    vshr.u16    q9, q13, #6
+    vaddhn.u16  d6, q14, q8
+    vshr.u16    q8, q12, #5
+    vaddhn.u16  d5, q13, q9
+    vqadd.u8    d6, d6, d0                  // moved up
+    vaddhn.u16  d4, q12, q8
+    // intentionally, don't calculate alpha result in d4-d6
+
+    vqadd.u8    d5, d5, d1
+    vqadd.u8    d4, d4, d2
+
+    // pack 8888 {d4-d6} to 0565 q10
+    vshll.u8    q10, d6, #8
+    vshll.u8    q3, d5, #8
+    vshll.u8    q2, d4, #8
+    vsri.u16    q10, q3, #5
+    vsri.u16    q10, q2, #11
+
+    // store, mirroring the partial loads above
+    tst         r2, #4
+    beq         24f
+    vst1.16     {d21}, [r3]!
+
+24:
+    tst         r2, #2
+    beq         22f
+    vst1.32     {d20[1]}, [r3]!
+
+22:
+    tst         r2, #1
+    beq         21f
+    vst1.16     {d20[1]}, [r3]!
+
+21:
+    bx          lr
+
+// count >= 8
+// The first block covers (count & 7) pixels, or 8 when count is a multiple
+// of 8.  Eight pixels are always loaded and blended, but the pointers only
+// advance by the block size, so the next block overlaps the tail of the
+// first.  Each block is stored after the loads of the following block, so
+// overlapped pixels are re-read from the original destination.
+.Lcount_ge_8:
+    ands        ip, r2, #7
+    vmov.u8     d31, #1<<7
+    vld1.16     {q12}, [r0]
+    vld4.8      {d0-d3}, [r1]
+    moveq       ip, #8
+    mov         r3, r0
+
+    add         r1, r1, ip, LSL#2
+    add         r0, r0, ip, LSL#1
+    subs        r2, r2, ip
+    b           9f
+
+// LOOP
+2:
+    vld1.16     {q12}, [r0]!
+    vld4.8      {d0-d3}, [r1]!
+    vst1.16     {q10}, [r3]
+    sub         r3, r0, #8*2
+    subs        r2, r2, #8
+9:
+    pld         [r0,#32]
+    // expand 0565 q12 to 8888 {d4-d6}
+    vmovn.u16   d4, q12
+    vshr.u16    q11, q12, #5
+    vshr.u16    q10, q12, #6+5
+    vmovn.u16   d5, q11
+    vmovn.u16   d6, q10
+    vshl.u8     d4, d4, #3
+    vshl.u8     d5, d5, #2
+    vshl.u8     d6, d6, #3
+
+    // dst * (255 - alpha) + 128, then narrow back to 8 bits per channel
+    vmovl.u8    q14, d31
+    vmovl.u8    q13, d31
+    vmovl.u8    q12, d31
+    vmvn.8      d30, d3
+    vmlal.u8    q14, d30, d6
+    vmlal.u8    q13, d30, d5
+    vmlal.u8    q12, d30, d4
+    vshr.u16    q8, q14, #5
+    vshr.u16    q9, q13, #6
+    vaddhn.u16  d6, q14, q8                 // moved up
+    vshr.u16    q8, q12, #5
+    vaddhn.u16  d5, q13, q9
+    // intentionally, don't calculate alpha result in d4-d6
+
+    vqadd.u8    d6, d6, d0
+    vaddhn.u16  d4, q12, q8
+
+    // pack 8888 {d4-d6} to 0565 q10
+    vqadd.u8    d5, d5, d1
+    vqadd.u8    d4, d4, d2
+    vshll.u8    q10, d6, #8
+    vshll.u8    q3, d5, #8
+    vshll.u8    q2, d4, #8
+    vsri.u16    q10, q3, #5
+    vsri.u16    q10, q2, #11
+
+    bne         2b                          // flags still from the last subs
+
+1:
+    vst1.16     {q10}, [r3]
+
+    bx          lr
-- 
1.7.4.1