/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.fpu neon
	.arch armv7a
	.object_arch armv4
	.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
	.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
	.arm
	.altmacro              /* allows macro arguments to be referenced without the leading backslash, e.g. "bpp" in .if bpp == 8 */
	.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/*
 * NEON-accelerated fill and alpha-blit scanline functions, built on the
 * pixman ARM NEON assembly framework.  The generate_composite_function
 * macro, the PF prefetch helpers, and pixman_asm_function are provided by
 * the two headers included above (pixman-arm-neon-asm.h / pixman-arm-asm.h)
 * -- NOTE(review): their exact semantics are defined there, not here.
 */

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 * to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 *
 * (a1-a4 / v1-v3 / ip are the standard APCS aliases for r0-r3 / r4-r6 / r12.)
 */
pixman_asm_function name
        /* Broadcast the fill value from the stack slot into all lanes of
         * q0 and q1, so a single vst1 of {q0-q1} writes 32 bytes of fill. */
        vld1.\bpp   {d0[],d1[]}, [sp]
        sub         a4, a1              /* stride -= width: a4 becomes the row-to-row skip in pixels */
        vld1.\bpp   {d2[],d3[]}, [sp]
        /* Rows shorter than 15 alignment bytes + one 64-byte inner-loop block
         * (scaled to pixels) can't be guaranteed an aligned 64-byte run, so
         * they take the short-row path at label 51. */
        cmp         a1, #(15+64) >> \log2Bpp
        push        {v1-v3,lr}
        vmov        ip, s0              /* keep a scalar copy of the fill value for byte/halfword stores */
        blo         51f

        /* Long-row case */
        mov         v2, #64             /* post-increment step for the dual-pointer inner loop */
1:      mov         v1, a1
        ands        v3, a3, #15
        beq         2f
        /* Leading pixels */
        rsb         v3, v3, #16 /* number of leading bytes until 16-byte aligned */
        sub         v1, v1, v3, lsr #\log2Bpp
        /* rbit mirrors the leading-byte count so successive "movs ... lsl"
         * below can pop its bits (1,2,4,8) into the N/C flags and drive the
         * conditional stores without explicit compares. */
        rbit        v3, v3
 .if bpp <= 16
 .if bpp == 8
        tst         a3, #1  /* bit 0 unaffected by rsb so can avoid register interlock */
        strneb      ip, [a3], #1
        tst         v3, #1<<30
 .else
        tst         a3, #2  /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
 .endif
        strneh      ip, [a3], #2
 .endif
        movs        v3, v3, lsl #3      /* C = need 4-byte store, N = need 8-byte store */
        vstmcs      a3!, {s0}
        vstmmi      a3!, {d0}
2:      sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
        add         v3, a3, #32         /* second store pointer, 32 bytes ahead of a3 */
        /* Inner loop: two interleaved 32-byte stores per iteration (a3 and
         * v3 each advance by v2 = 64), i.e. 64 bytes per pass. */
3:      vst1.\bpp   {q0-q1}, [a3 :128], v2
        subs        v1, v1, #64 >> \log2Bpp
        vst1.\bpp   {q0-q1}, [v3 :128], v2
        bhs         3b
        /* Trailing pixels: shift the remaining-pixel count so that the
         * "32 bytes left" bit lands in C and the "16 bytes left" bit in N,
         * then store conditionally; repeat with lsl #2 for 8/4 and 2/1. */
4:      movs        v1, v1, lsl #27 + \log2Bpp
        bcc         5f
        vst1.\bpp   {q0-q1}, [a3 :128]!
5:      bpl         6f
        vst1.\bpp   {q0}, [a3 :128]!
6:      movs        v1, v1, lsl #2      /* C = 8 trailing bytes, N = 4 */
        vstmcs      a3!, {d0}
        vstmmi      a3!, {s0}
 .if bpp <= 16
        movs        v1, v1, lsl #2      /* C = 2 trailing bytes, N = 1 */
        strcsh      ip, [a3], #2
 .if bpp == 8
        strmib      ip, [a3], #1
 .endif
 .endif
        subs        a2, a2, #1
        add         a3, a3, a4, lsl #\log2Bpp   /* skip to the next row (a4 = stride - width) */
        bhi         1b
        pop         {v1-v3,pc}

        /* Short-row case: no aligned 64-byte run guaranteed; store bytes /
         * halfwords until 4-byte aligned, then use the same flag-driven
         * tail ladder for at most 32+16+8+4+2+1 bytes. */
51:     movs        v1, a1
 .if bpp == 8
        tst         a3, #3
        beq         53f
52:     subs        v1, v1, #1
        blo         57f                 /* row exhausted while aligning */
        strb        ip, [a3], #1
        tst         a3, #3
        bne         52b
 .elseif bpp == 16
        tstne       a3, #2
        subne       v1, v1, #1
        strneh      ip, [a3], #2
 .endif
53:     cmp         v1, #32 >> \log2Bpp
        bcc         54f
        vst1.\bpp   {q0-q1}, [a3]!
        sub         v1, v1, #32 >> \log2Bpp
        /* Trailing pixels (same flag trick as label 4 above) */
54:     movs        v1, v1, lsl #27 + \log2Bpp
        bcc         55f
        vst1.\bpp   {q0-q1}, [a3]!
55:     bpl         56f
        vst1.\bpp   {q0}, [a3]!
56:     movs        v1, v1, lsl #2
        vstmcs      a3!, {d0}
        vstmmi      a3!, {s0}
 .if bpp <= 16
        movs        v1, v1, lsl #2
        strcsh      ip, [a3], #2
 .if bpp == 8
        strmib      ip, [a3], #1
 .endif
 .endif
        subs        a2, a2, #1
        add         a3, a3, a4, lsl #\log2Bpp
        bhi         51b
57:     pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm, 8, 0

/******************************************************************************/

/*
 * 32bpp-over-32bpp alpha blend, 8 pixels per block.
 * On entry to the head: d0-d3 = deinterleaved source channels (d3 = source
 * alpha), d4-d7 = deinterleaved destination channels (d7 = dest alpha).
 * Each channel is blended as (src*a + dst*(255-a)); the vrshr #8 +
 * vraddhn #8 pair then approximates the /255 normalisation with correct
 * rounding.  Results are left in d28-d31 for the framework's vst4.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm

/*
 * Software-pipelined variant: stores the previous block's result, loads the
 * next block, and recomputes the head, with the framework's PF prefetch
 * instructions interleaved between NEON ops so they execute "for free" on
 * dual-issue cores.  Instruction order is deliberate scheduling -- do not
 * reorder.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
    PF add PF_X, PF_X, #8
    vst4.8      {d28-d31}, [DST_W :128]!
    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head

/******************************************************************************/

/*
 * 32bpp-with-alpha over RGB565, 8 pixels per block.
 * On entry to the head: d0-d3 = deinterleaved 8-bit source channels
 * (d3 = source alpha), q2 = eight destination 565 pixels.
 * The source channels are shifted down to 565 precision (5/6/5 bits), the
 * destination 565 fields are unpacked into d24/d7/d25 via shift/narrow/bic,
 * and each field is blended against the (reduced) alpha into q13/q14/q15.
 * NOTE(review): which of d0/d2 maps to red vs. blue depends on the
 * framework's deinterleave order -- confirm against pixman-arm-neon-asm.h.
 */
.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3              /* inverted source alpha */
    vshr.u8     d1, #2              /* source channel to 6-bit */
    vshr.u8     d3, #3              /* reduce alpha to 5-bit to match field widths */
    vshr.u8     d0, #3              /* source channel to 5-bit */
    vshrn.u16   d7, q2, #3          /* extract dest 6-bit middle field (plus stray low bits) */
    vshrn.u16   d25, q2, #8         /* extract dest top field (plus stray low bits) */
    vbic.i16    q2, #0xe0           /* clear bits 7:5 so only the low 5-bit field survives the narrow */
    vshr.u8     d6, #3              /* inverted alpha to 5-bit */
    vshr.u8     d7, #2              /* finish isolating the 6-bit field */
    vshr.u8     d2, #3              /* source channel to 5-bit */
    vmovn.u16   d24, q2             /* dest low 5-bit field */
    vshr.u8     d25, #3             /* finish isolating the top 5-bit field */
    vmull.u8    q13, d1, d3         /* blend 6-bit field: src*a ... */
    vmlal.u8    q13, d7, d6         /* ... + dst*(~a) */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

/*
 * Normalise the three blended fields (vsra+vrshr #5 approximates division
 * by 31 with rounding) and re-pack them into 565 layout in q14 by shifting
 * the other two fields into place with vsli.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm

/*
 * Software-pipelined tail+head with interleaved PF prefetch, as in the
 * RGB-to-RGB case above.  Instruction order is deliberate scheduling --
 * do not reorder.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
    PF add PF_X, PF_X, #8
    vsra.u16    q13, #5
    PF tst PF_CTL, #0xF
    vsra.u16    q14, #5
    PF addne PF_X, PF_X, #8
    vsra.u16    q15, #5
    PF subne PF_CTL, PF_CTL, #1
    vrshr.u16   q13, #5
    PF cmp PF_X, ORIG_W
    vrshr.u16   q14, #5
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vrshr.u16   q15, #5
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
    PF subge PF_X, PF_X, ORIG_W
    vsli.u16    q14, q13, #5
    PF subges PF_CTL, PF_CTL, #0x10
    vsli.u16    q14, q15, #11
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head