1 /*
2   Simple DirectMedia Layer
3   Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
4 
5   This software is provided 'as-is', without any express or implied
6   warranty.  In no event will the authors be held liable for any damages
7   arising from the use of this software.
8 
9   Permission is granted to anyone to use this software for any purpose,
10   including commercial applications, and to alter it and redistribute it
11   freely, subject to the following restrictions:
12 
13   1. The origin of this software must not be misrepresented; you must not
14      claim that you wrote the original software. If you use this software
15      in a product, an acknowledgment in the product documentation would be
16      appreciated but is not required.
17   2. Altered source versions must be plainly marked as such, and must not be
18      misrepresented as being the original software.
19   3. This notice may not be removed or altered from any source distribution.
20 */
21 #include "../SDL_internal.h"
22 
23 #if SDL_HAVE_BLIT_A
24 
25 #include "SDL_video.h"
26 #include "SDL_blit.h"
27 
28 /* Functions to perform alpha blended blitting */
29 
30 /* N->1 blending with per-surface alpha */
31 static void
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)32 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
33 {
34     int width = info->dst_w;
35     int height = info->dst_h;
36     Uint8 *src = info->src;
37     int srcskip = info->src_skip;
38     Uint8 *dst = info->dst;
39     int dstskip = info->dst_skip;
40     Uint8 *palmap = info->table;
41     SDL_PixelFormat *srcfmt = info->src_fmt;
42     SDL_PixelFormat *dstfmt = info->dst_fmt;
43     int srcbpp = srcfmt->BytesPerPixel;
44     Uint32 Pixel;
45     unsigned sR, sG, sB;
46     unsigned dR, dG, dB;
47     const unsigned A = info->a;
48 
49     while (height--) {
50         /* *INDENT-OFF* */
51         DUFFS_LOOP4(
52         {
53         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
54         dR = dstfmt->palette->colors[*dst].r;
55         dG = dstfmt->palette->colors[*dst].g;
56         dB = dstfmt->palette->colors[*dst].b;
57         ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
58         dR &= 0xff;
59         dG &= 0xff;
60         dB &= 0xff;
61         /* Pack RGB into 8bit pixel */
62         if ( palmap == NULL ) {
63             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
64         } else {
65             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
66         }
67         dst++;
68         src += srcbpp;
69         },
70         width);
71         /* *INDENT-ON* */
72         src += srcskip;
73         dst += dstskip;
74     }
75 }
76 
77 /* N->1 blending with pixel alpha */
78 static void
BlitNto1PixelAlpha(SDL_BlitInfo * info)79 BlitNto1PixelAlpha(SDL_BlitInfo * info)
80 {
81     int width = info->dst_w;
82     int height = info->dst_h;
83     Uint8 *src = info->src;
84     int srcskip = info->src_skip;
85     Uint8 *dst = info->dst;
86     int dstskip = info->dst_skip;
87     Uint8 *palmap = info->table;
88     SDL_PixelFormat *srcfmt = info->src_fmt;
89     SDL_PixelFormat *dstfmt = info->dst_fmt;
90     int srcbpp = srcfmt->BytesPerPixel;
91     Uint32 Pixel;
92     unsigned sR, sG, sB, sA;
93     unsigned dR, dG, dB;
94 
95     while (height--) {
96         /* *INDENT-OFF* */
97         DUFFS_LOOP4(
98         {
99         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
100         dR = dstfmt->palette->colors[*dst].r;
101         dG = dstfmt->palette->colors[*dst].g;
102         dB = dstfmt->palette->colors[*dst].b;
103         ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
104         dR &= 0xff;
105         dG &= 0xff;
106         dB &= 0xff;
107         /* Pack RGB into 8bit pixel */
108         if ( palmap == NULL ) {
109             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
110         } else {
111             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
112         }
113         dst++;
114         src += srcbpp;
115         },
116         width);
117         /* *INDENT-ON* */
118         src += srcskip;
119         dst += dstskip;
120     }
121 }
122 
123 /* colorkeyed N->1 blending with per-surface alpha */
124 static void
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)125 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
126 {
127     int width = info->dst_w;
128     int height = info->dst_h;
129     Uint8 *src = info->src;
130     int srcskip = info->src_skip;
131     Uint8 *dst = info->dst;
132     int dstskip = info->dst_skip;
133     Uint8 *palmap = info->table;
134     SDL_PixelFormat *srcfmt = info->src_fmt;
135     SDL_PixelFormat *dstfmt = info->dst_fmt;
136     int srcbpp = srcfmt->BytesPerPixel;
137     Uint32 ckey = info->colorkey;
138     Uint32 Pixel;
139     unsigned sR, sG, sB;
140     unsigned dR, dG, dB;
141     const unsigned A = info->a;
142 
143     while (height--) {
144         /* *INDENT-OFF* */
145         DUFFS_LOOP(
146         {
147         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
148         if ( Pixel != ckey ) {
149             dR = dstfmt->palette->colors[*dst].r;
150             dG = dstfmt->palette->colors[*dst].g;
151             dB = dstfmt->palette->colors[*dst].b;
152             ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
153             dR &= 0xff;
154             dG &= 0xff;
155             dB &= 0xff;
156             /* Pack RGB into 8bit pixel */
157             if ( palmap == NULL ) {
158                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
159             } else {
160                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
161             }
162         }
163         dst++;
164         src += srcbpp;
165         },
166         width);
167         /* *INDENT-ON* */
168         src += srcskip;
169         dst += dstskip;
170     }
171 }
172 
173 #ifdef __MMX__
174 
175 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
176 static void
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)177 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
178 {
179     int width = info->dst_w;
180     int height = info->dst_h;
181     Uint32 *srcp = (Uint32 *) info->src;
182     int srcskip = info->src_skip >> 2;
183     Uint32 *dstp = (Uint32 *) info->dst;
184     int dstskip = info->dst_skip >> 2;
185     Uint32 dalpha = info->dst_fmt->Amask;
186 
187     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
188 
189     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
190     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
191     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
192 
193     while (height--) {
194         int n = width;
195         if (n & 1) {
196             Uint32 s = *srcp++;
197             Uint32 d = *dstp;
198             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
199                        + (s & d & 0x00010101)) | dalpha;
200             n--;
201         }
202 
203         for (n >>= 1; n > 0; --n) {
204             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
205             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
206 
207             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
208             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
209 
210             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
211             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
212             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
213             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
214 
215             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
216             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
217             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
218             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
219 
220             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
221             dstp += 2;
222             srcp += 2;
223         }
224 
225         srcp += srcskip;
226         dstp += dstskip;
227     }
228     _mm_empty();
229 }
230 
231 /* fast RGB888->(A)RGB888 blending with surface alpha */
232 static void
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)233 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
234 {
235     SDL_PixelFormat *df = info->dst_fmt;
236     Uint32 chanmask;
237     unsigned alpha = info->a;
238 
239     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
240         /* only call a128 version when R,G,B occupy lower bits */
241         BlitRGBtoRGBSurfaceAlpha128MMX(info);
242     } else {
243         int width = info->dst_w;
244         int height = info->dst_h;
245         Uint32 *srcp = (Uint32 *) info->src;
246         int srcskip = info->src_skip >> 2;
247         Uint32 *dstp = (Uint32 *) info->dst;
248         int dstskip = info->dst_skip >> 2;
249         Uint32 dalpha = df->Amask;
250         Uint32 amult;
251 
252         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
253 
254         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
255         /* form the alpha mult */
256         amult = alpha | (alpha << 8);
257         amult = amult | (amult << 16);
258         chanmask =
259             (0xff << df->Rshift) | (0xff << df->
260                                     Gshift) | (0xff << df->Bshift);
261         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
262         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
263         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
264         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
265 
266         while (height--) {
267             int n = width;
268             if (n & 1) {
269                 /* One Pixel Blend */
270                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
271                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
272 
273                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
274                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
275 
276                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
277                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
278                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
279                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
280 
281                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
282                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
283                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
284 
285                 ++srcp;
286                 ++dstp;
287 
288                 n--;
289             }
290 
291             for (n >>= 1; n > 0; --n) {
292                 /* Two Pixels Blend */
293                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
294                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
295                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
296                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
297 
298                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
299                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
300                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
301                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
302 
303                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
304                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
305                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
306                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
307 
308                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
309                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
310                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
311                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
312 
313                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
314                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
315 
316                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
317 
318                 srcp += 2;
319                 dstp += 2;
320             }
321             srcp += srcskip;
322             dstp += dstskip;
323         }
324         _mm_empty();
325     }
326 }
327 
328 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
329 static void
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)330 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
331 {
332     int width = info->dst_w;
333     int height = info->dst_h;
334     Uint32 *srcp = (Uint32 *) info->src;
335     int srcskip = info->src_skip >> 2;
336     Uint32 *dstp = (Uint32 *) info->dst;
337     int dstskip = info->dst_skip >> 2;
338     SDL_PixelFormat *sf = info->src_fmt;
339     Uint32 amask = sf->Amask;
340     Uint32 ashift = sf->Ashift;
341     Uint64 multmask, multmask2;
342 
343     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
344 
345     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
346     multmask = 0x00FF;
347     multmask <<= (ashift * 2);
348     multmask2 = 0x00FF00FF00FF00FFULL;
349 
350     while (height--) {
351         /* *INDENT-OFF* */
352         DUFFS_LOOP4({
353         Uint32 alpha = *srcp & amask;
354         if (alpha == 0) {
355             /* do nothing */
356         } else if (alpha == amask) {
357             *dstp = *srcp;
358         } else {
359             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
360             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
361 
362             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
363             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
364 
365             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
366             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
367             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
368             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
369             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
370             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
371 
372             /* blend */
373             src1 = _mm_mullo_pi16(src1, mm_alpha);
374             src1 = _mm_srli_pi16(src1, 8);
375             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
376             dst1 = _mm_srli_pi16(dst1, 8);
377             dst1 = _mm_add_pi16(src1, dst1);
378             dst1 = _mm_packs_pu16(dst1, mm_zero);
379 
380             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
381         }
382         ++srcp;
383         ++dstp;
384         }, width);
385         /* *INDENT-ON* */
386         srcp += srcskip;
387         dstp += dstskip;
388     }
389     _mm_empty();
390 }
391 
392 #endif /* __MMX__ */
393 
394 #if SDL_ARM_SIMD_BLITTERS
395 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
396 
397 static void
BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)398 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
399 {
400 	int32_t width = info->dst_w;
401 	int32_t height = info->dst_h;
402 	uint16_t *dstp = (uint16_t *)info->dst;
403 	int32_t dststride = width + (info->dst_skip >> 1);
404 	uint32_t *srcp = (uint32_t *)info->src;
405 	int32_t srcstride = width + (info->src_skip >> 2);
406 
407 	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
408 }
409 
410 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
411 
412 static void
BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)413 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
414 {
415     int32_t width = info->dst_w;
416     int32_t height = info->dst_h;
417     uint32_t *dstp = (uint32_t *)info->dst;
418     int32_t dststride = width + (info->dst_skip >> 2);
419     uint32_t *srcp = (uint32_t *)info->src;
420     int32_t srcstride = width + (info->src_skip >> 2);
421 
422     BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
423 }
424 #endif
425 
426 #if SDL_ARM_NEON_BLITTERS
427 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
428 
429 static void
BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)430 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
431 {
432     int32_t width = info->dst_w;
433     int32_t height = info->dst_h;
434     uint16_t *dstp = (uint16_t *)info->dst;
435     int32_t dststride = width + (info->dst_skip >> 1);
436     uint32_t *srcp = (uint32_t *)info->src;
437     int32_t srcstride = width + (info->src_skip >> 2);
438 
439     BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
440 }
441 
442 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
443 
444 static void
BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)445 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
446 {
447 	int32_t width = info->dst_w;
448 	int32_t height = info->dst_h;
449 	uint32_t *dstp = (uint32_t *)info->dst;
450 	int32_t dststride = width + (info->dst_skip >> 2);
451 	uint32_t *srcp = (uint32_t *)info->src;
452 	int32_t srcstride = width + (info->src_skip >> 2);
453 
454 	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
455 }
456 #endif
457 
458 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
459 static void
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)460 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
461 {
462     int width = info->dst_w;
463     int height = info->dst_h;
464     Uint32 *srcp = (Uint32 *) info->src;
465     int srcskip = info->src_skip >> 2;
466     Uint32 *dstp = (Uint32 *) info->dst;
467     int dstskip = info->dst_skip >> 2;
468 
469     while (height--) {
470         /* *INDENT-OFF* */
471         DUFFS_LOOP4({
472             Uint32 s = *srcp++;
473             Uint32 d = *dstp;
474             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
475                    + (s & d & 0x00010101)) | 0xff000000;
476         }, width);
477         /* *INDENT-ON* */
478         srcp += srcskip;
479         dstp += dstskip;
480     }
481 }
482 
483 /* fast RGB888->(A)RGB888 blending with surface alpha */
484 static void
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)485 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
486 {
487     unsigned alpha = info->a;
488     if (alpha == 128) {
489         BlitRGBtoRGBSurfaceAlpha128(info);
490     } else {
491         int width = info->dst_w;
492         int height = info->dst_h;
493         Uint32 *srcp = (Uint32 *) info->src;
494         int srcskip = info->src_skip >> 2;
495         Uint32 *dstp = (Uint32 *) info->dst;
496         int dstskip = info->dst_skip >> 2;
497         Uint32 s;
498         Uint32 d;
499         Uint32 s1;
500         Uint32 d1;
501 
502         while (height--) {
503             /* *INDENT-OFF* */
504             DUFFS_LOOP4({
505                 s = *srcp;
506                 d = *dstp;
507                 s1 = s & 0xff00ff;
508                 d1 = d & 0xff00ff;
509                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
510                      & 0xff00ff;
511                 s &= 0xff00;
512                 d &= 0xff00;
513                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
514                 *dstp = d1 | d | 0xff000000;
515                 ++srcp;
516                 ++dstp;
517             }, width);
518             /* *INDENT-ON* */
519             srcp += srcskip;
520             dstp += dstskip;
521         }
522     }
523 }
524 
525 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
526 static void
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)527 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
528 {
529     int width = info->dst_w;
530     int height = info->dst_h;
531     Uint32 *srcp = (Uint32 *) info->src;
532     int srcskip = info->src_skip >> 2;
533     Uint32 *dstp = (Uint32 *) info->dst;
534     int dstskip = info->dst_skip >> 2;
535 
536     while (height--) {
537         /* *INDENT-OFF* */
538         DUFFS_LOOP4({
539         Uint32 dalpha;
540         Uint32 d;
541         Uint32 s1;
542         Uint32 d1;
543         Uint32 s = *srcp;
544         Uint32 alpha = s >> 24;
545         /* FIXME: Here we special-case opaque alpha since the
546            compositioning used (>>8 instead of /255) doesn't handle
547            it correctly. Also special-case alpha=0 for speed?
548            Benchmark this! */
549         if (alpha) {
550           if (alpha == SDL_ALPHA_OPAQUE) {
551               *dstp = *srcp;
552           } else {
553             /*
554              * take out the middle component (green), and process
555              * the other two in parallel. One multiply less.
556              */
557             d = *dstp;
558             dalpha = d >> 24;
559             s1 = s & 0xff00ff;
560             d1 = d & 0xff00ff;
561             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
562             s &= 0xff00;
563             d &= 0xff00;
564             d = (d + ((s - d) * alpha >> 8)) & 0xff00;
565             dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
566             *dstp = d1 | d | (dalpha << 24);
567           }
568         }
569         ++srcp;
570         ++dstp;
571         }, width);
572         /* *INDENT-ON* */
573         srcp += srcskip;
574         dstp += dstskip;
575     }
576 }
577 
578 #ifdef __3dNOW__
579 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
580 static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)581 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
582 {
583     int width = info->dst_w;
584     int height = info->dst_h;
585     Uint32 *srcp = (Uint32 *) info->src;
586     int srcskip = info->src_skip >> 2;
587     Uint32 *dstp = (Uint32 *) info->dst;
588     int dstskip = info->dst_skip >> 2;
589     SDL_PixelFormat *sf = info->src_fmt;
590     Uint32 amask = sf->Amask;
591     Uint32 ashift = sf->Ashift;
592     Uint64 multmask, multmask2;
593 
594     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
595 
596     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
597     multmask = 0x00FF;
598     multmask <<= (ashift * 2);
599     multmask2 = 0x00FF00FF00FF00FFULL;
600 
601     while (height--) {
602         /* *INDENT-OFF* */
603         DUFFS_LOOP4({
604         Uint32 alpha;
605 
606         _m_prefetch(srcp + 16);
607         _m_prefetch(dstp + 16);
608 
609         alpha = *srcp & amask;
610         if (alpha == 0) {
611             /* do nothing */
612         } else if (alpha == amask) {
613             *dstp = *srcp;
614         } else {
615             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
616             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
617 
618             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
619             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
620 
621             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
622             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
623             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
624             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
625             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
626             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
627 
628 
629             /* blend */
630             src1 = _mm_mullo_pi16(src1, mm_alpha);
631             src1 = _mm_srli_pi16(src1, 8);
632             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
633             dst1 = _mm_srli_pi16(dst1, 8);
634             dst1 = _mm_add_pi16(src1, dst1);
635             dst1 = _mm_packs_pu16(dst1, mm_zero);
636 
637             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
638         }
639         ++srcp;
640         ++dstp;
641         }, width);
642         /* *INDENT-ON* */
643         srcp += srcskip;
644         dstp += dstskip;
645     }
646     _mm_empty();
647 }
648 
649 #endif /* __3dNOW__ */
650 
651 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
652 
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects the bits of each channel that can be summed and halved
 * without spilling into a neighbouring channel; the remaining bits (one low
 * bit per channel) are kept only where source and destination both have them
 * set.  Every argument is parenthesized so that expression arguments are
 * grouped as the caller wrote them; arguments are still evaluated more than
 * once.
 */
#define BLEND16_50(d, s, mask)                        \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))
656 
/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 *
 * The 16 bit `mask` is replicated into both halves of a 32 bit word.  It is
 * widened to Uint32 before the shift because shifting a promoted (signed int)
 * value such as 0xf7de left by 16 overflows and is undefined behaviour.
 * Every argument is parenthesized so that expression arguments are grouped as
 * the caller wrote them; arguments are still evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)                                          \
    ((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)             \
     + (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)           \
     + ((s) & (d) & (~((Uint32)(mask) | ((Uint32)(mask) << 16)))))
661 
662 static void
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info,Uint16 mask)663 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
664 {
665     int width = info->dst_w;
666     int height = info->dst_h;
667     Uint16 *srcp = (Uint16 *) info->src;
668     int srcskip = info->src_skip >> 1;
669     Uint16 *dstp = (Uint16 *) info->dst;
670     int dstskip = info->dst_skip >> 1;
671 
672     while (height--) {
673         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
674             /*
675              * Source and destination not aligned, pipeline it.
676              * This is mostly a win for big blits but no loss for
677              * small ones
678              */
679             Uint32 prev_sw;
680             int w = width;
681 
682             /* handle odd destination */
683             if ((uintptr_t) dstp & 2) {
684                 Uint16 d = *dstp, s = *srcp;
685                 *dstp = BLEND16_50(d, s, mask);
686                 dstp++;
687                 srcp++;
688                 w--;
689             }
690             srcp++;             /* srcp is now 32-bit aligned */
691 
692             /* bootstrap pipeline with first halfword */
693             prev_sw = ((Uint32 *) srcp)[-1];
694 
695             while (w > 1) {
696                 Uint32 sw, dw, s;
697                 sw = *(Uint32 *) srcp;
698                 dw = *(Uint32 *) dstp;
699 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
700                 s = (prev_sw << 16) + (sw >> 16);
701 #else
702                 s = (prev_sw >> 16) + (sw << 16);
703 #endif
704                 prev_sw = sw;
705                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
706                 dstp += 2;
707                 srcp += 2;
708                 w -= 2;
709             }
710 
711             /* final pixel if any */
712             if (w) {
713                 Uint16 d = *dstp, s;
714 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
715                 s = (Uint16) prev_sw;
716 #else
717                 s = (Uint16) (prev_sw >> 16);
718 #endif
719                 *dstp = BLEND16_50(d, s, mask);
720                 srcp++;
721                 dstp++;
722             }
723             srcp += srcskip - 1;
724             dstp += dstskip;
725         } else {
726             /* source and destination are aligned */
727             int w = width;
728 
729             /* first odd pixel? */
730             if ((uintptr_t) srcp & 2) {
731                 Uint16 d = *dstp, s = *srcp;
732                 *dstp = BLEND16_50(d, s, mask);
733                 srcp++;
734                 dstp++;
735                 w--;
736             }
737             /* srcp and dstp are now 32-bit aligned */
738 
739             while (w > 1) {
740                 Uint32 sw = *(Uint32 *) srcp;
741                 Uint32 dw = *(Uint32 *) dstp;
742                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
743                 srcp += 2;
744                 dstp += 2;
745                 w -= 2;
746             }
747 
748             /* last odd pixel? */
749             if (w) {
750                 Uint16 d = *dstp, s = *srcp;
751                 *dstp = BLEND16_50(d, s, mask);
752                 srcp++;
753                 dstp++;
754             }
755             srcp += srcskip;
756             dstp += dstskip;
757         }
758     }
759 }
760 
761 #ifdef __MMX__
762 
763 /* fast RGB565->RGB565 blending with surface alpha */
764 static void
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)765 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
766 {
767     unsigned alpha = info->a;
768     if (alpha == 128) {
769         Blit16to16SurfaceAlpha128(info, 0xf7de);
770     } else {
771         int width = info->dst_w;
772         int height = info->dst_h;
773         Uint16 *srcp = (Uint16 *) info->src;
774         int srcskip = info->src_skip >> 1;
775         Uint16 *dstp = (Uint16 *) info->dst;
776         int dstskip = info->dst_skip >> 1;
777         Uint32 s, d;
778 
779         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
780 
781         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
782         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
783         alpha >>= 3;            /* downscale alpha to 5 bits */
784 
785         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
786         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
787         /* position alpha to allow for mullo and mulhi on diff channels
788            to reduce the number of operations */
789         mm_alpha = _mm_slli_si64(mm_alpha, 3);
790 
791         /* Setup the 565 color channel masks */
792         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
793         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
794 
795         while (height--) {
796             /* *INDENT-OFF* */
797             DUFFS_LOOP_124(
798             {
799                 s = *srcp++;
800                 d = *dstp;
801                 /*
802                  * shift out the middle component (green) to
803                  * the high 16 bits, and process all three RGB
804                  * components at the same time.
805                  */
806                 s = (s | s << 16) & 0x07e0f81f;
807                 d = (d | d << 16) & 0x07e0f81f;
808                 d += (s - d) * alpha >> 5;
809                 d &= 0x07e0f81f;
810                 *dstp++ = (Uint16)(d | d >> 16);
811             },{
812                 s = *srcp++;
813                 d = *dstp;
814                 /*
815                  * shift out the middle component (green) to
816                  * the high 16 bits, and process all three RGB
817                  * components at the same time.
818                  */
819                 s = (s | s << 16) & 0x07e0f81f;
820                 d = (d | d << 16) & 0x07e0f81f;
821                 d += (s - d) * alpha >> 5;
822                 d &= 0x07e0f81f;
823                 *dstp++ = (Uint16)(d | d >> 16);
824                 s = *srcp++;
825                 d = *dstp;
826                 /*
827                  * shift out the middle component (green) to
828                  * the high 16 bits, and process all three RGB
829                  * components at the same time.
830                  */
831                 s = (s | s << 16) & 0x07e0f81f;
832                 d = (d | d << 16) & 0x07e0f81f;
833                 d += (s - d) * alpha >> 5;
834                 d &= 0x07e0f81f;
835                 *dstp++ = (Uint16)(d | d >> 16);
836             },{
837                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
838                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
839 
840                 /* red */
841                 src2 = src1;
842                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
843 
844                 dst2 = dst1;
845                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
846 
847                 /* blend */
848                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
849                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
850                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
851                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
852                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
853 
854                 mm_res = dst2; /* RED -> mm_res */
855 
856                 /* green -- process the bits in place */
857                 src2 = src1;
858                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
859 
860                 dst2 = dst1;
861                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
862 
863                 /* blend */
864                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
865                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
866                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
867                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
868 
869                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
870 
871                 /* blue */
872                 src2 = src1;
873                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
874 
875                 dst2 = dst1;
876                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
877 
878                 /* blend */
879                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
880                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
881                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
882                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
883                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
884 
885                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
886 
887                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
888 
889                 srcp += 4;
890                 dstp += 4;
891             }, width);
892             /* *INDENT-ON* */
893             srcp += srcskip;
894             dstp += dstskip;
895         }
896         _mm_empty();
897     }
898 }
899 
900 /* fast RGB555->RGB555 blending with surface alpha */
901 static void
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)902 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
903 {
904     unsigned alpha = info->a;
905     if (alpha == 128) {
906         Blit16to16SurfaceAlpha128(info, 0xfbde);
907     } else {
908         int width = info->dst_w;
909         int height = info->dst_h;
910         Uint16 *srcp = (Uint16 *) info->src;
911         int srcskip = info->src_skip >> 1;
912         Uint16 *dstp = (Uint16 *) info->dst;
913         int dstskip = info->dst_skip >> 1;
914         Uint32 s, d;
915 
916         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
917 
918         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
919         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
920         alpha >>= 3;            /* downscale alpha to 5 bits */
921 
922         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
923         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
924         /* position alpha to allow for mullo and mulhi on diff channels
925            to reduce the number of operations */
926         mm_alpha = _mm_slli_si64(mm_alpha, 3);
927 
928         /* Setup the 555 color channel masks */
929         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
930         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
931         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
932 
933         while (height--) {
934             /* *INDENT-OFF* */
935             DUFFS_LOOP_124(
936             {
937                 s = *srcp++;
938                 d = *dstp;
939                 /*
940                  * shift out the middle component (green) to
941                  * the high 16 bits, and process all three RGB
942                  * components at the same time.
943                  */
944                 s = (s | s << 16) & 0x03e07c1f;
945                 d = (d | d << 16) & 0x03e07c1f;
946                 d += (s - d) * alpha >> 5;
947                 d &= 0x03e07c1f;
948                 *dstp++ = (Uint16)(d | d >> 16);
949             },{
950                 s = *srcp++;
951                 d = *dstp;
952                 /*
953                  * shift out the middle component (green) to
954                  * the high 16 bits, and process all three RGB
955                  * components at the same time.
956                  */
957                 s = (s | s << 16) & 0x03e07c1f;
958                 d = (d | d << 16) & 0x03e07c1f;
959                 d += (s - d) * alpha >> 5;
960                 d &= 0x03e07c1f;
961                 *dstp++ = (Uint16)(d | d >> 16);
962                     s = *srcp++;
963                 d = *dstp;
964                 /*
965                  * shift out the middle component (green) to
966                  * the high 16 bits, and process all three RGB
967                  * components at the same time.
968                  */
969                 s = (s | s << 16) & 0x03e07c1f;
970                 d = (d | d << 16) & 0x03e07c1f;
971                 d += (s - d) * alpha >> 5;
972                 d &= 0x03e07c1f;
973                 *dstp++ = (Uint16)(d | d >> 16);
974             },{
975                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
976                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
977 
978                 /* red -- process the bits in place */
979                 src2 = src1;
980                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
981 
982                 dst2 = dst1;
983                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
984 
985                 /* blend */
986                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
987                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
988                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
989                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
990                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
991 
992                 mm_res = dst2; /* RED -> mm_res */
993 
994                 /* green -- process the bits in place */
995                 src2 = src1;
996                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
997 
998                 dst2 = dst1;
999                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
1000 
1001                 /* blend */
1002                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1003                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1004                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
1005                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1006 
1007                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
1008 
1009                 /* blue */
1010                 src2 = src1; /* src -> src2 */
1011                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
1012 
1013                 dst2 = dst1; /* dst -> dst2 */
1014                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
1015 
1016                 /* blend */
1017                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
1018                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
1019                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
1020                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
1021                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
1022 
1023                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
1024 
1025                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
1026 
1027                 srcp += 4;
1028                 dstp += 4;
1029             }, width);
1030             /* *INDENT-ON* */
1031             srcp += srcskip;
1032             dstp += dstskip;
1033         }
1034         _mm_empty();
1035     }
1036 }
1037 
1038 #endif /* __MMX__ */
1039 
1040 /* fast RGB565->RGB565 blending with surface alpha */
1041 static void
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)1042 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
1043 {
1044     unsigned alpha = info->a;
1045     if (alpha == 128) {
1046         Blit16to16SurfaceAlpha128(info, 0xf7de);
1047     } else {
1048         int width = info->dst_w;
1049         int height = info->dst_h;
1050         Uint16 *srcp = (Uint16 *) info->src;
1051         int srcskip = info->src_skip >> 1;
1052         Uint16 *dstp = (Uint16 *) info->dst;
1053         int dstskip = info->dst_skip >> 1;
1054         alpha >>= 3;            /* downscale alpha to 5 bits */
1055 
1056         while (height--) {
1057             /* *INDENT-OFF* */
1058             DUFFS_LOOP4({
1059                 Uint32 s = *srcp++;
1060                 Uint32 d = *dstp;
1061                 /*
1062                  * shift out the middle component (green) to
1063                  * the high 16 bits, and process all three RGB
1064                  * components at the same time.
1065                  */
1066                 s = (s | s << 16) & 0x07e0f81f;
1067                 d = (d | d << 16) & 0x07e0f81f;
1068                 d += (s - d) * alpha >> 5;
1069                 d &= 0x07e0f81f;
1070                 *dstp++ = (Uint16)(d | d >> 16);
1071             }, width);
1072             /* *INDENT-ON* */
1073             srcp += srcskip;
1074             dstp += dstskip;
1075         }
1076     }
1077 }
1078 
1079 /* fast RGB555->RGB555 blending with surface alpha */
1080 static void
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)1081 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
1082 {
1083     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
1084     if (alpha == 128) {
1085         Blit16to16SurfaceAlpha128(info, 0xfbde);
1086     } else {
1087         int width = info->dst_w;
1088         int height = info->dst_h;
1089         Uint16 *srcp = (Uint16 *) info->src;
1090         int srcskip = info->src_skip >> 1;
1091         Uint16 *dstp = (Uint16 *) info->dst;
1092         int dstskip = info->dst_skip >> 1;
1093         alpha >>= 3;            /* downscale alpha to 5 bits */
1094 
1095         while (height--) {
1096             /* *INDENT-OFF* */
1097             DUFFS_LOOP4({
1098                 Uint32 s = *srcp++;
1099                 Uint32 d = *dstp;
1100                 /*
1101                  * shift out the middle component (green) to
1102                  * the high 16 bits, and process all three RGB
1103                  * components at the same time.
1104                  */
1105                 s = (s | s << 16) & 0x03e07c1f;
1106                 d = (d | d << 16) & 0x03e07c1f;
1107                 d += (s - d) * alpha >> 5;
1108                 d &= 0x03e07c1f;
1109                 *dstp++ = (Uint16)(d | d >> 16);
1110             }, width);
1111             /* *INDENT-ON* */
1112             srcp += srcskip;
1113             dstp += dstskip;
1114         }
1115     }
1116 }
1117 
1118 /* fast ARGB8888->RGB565 blending with pixel alpha */
1119 static void
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)1120 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
1121 {
1122     int width = info->dst_w;
1123     int height = info->dst_h;
1124     Uint32 *srcp = (Uint32 *) info->src;
1125     int srcskip = info->src_skip >> 2;
1126     Uint16 *dstp = (Uint16 *) info->dst;
1127     int dstskip = info->dst_skip >> 1;
1128 
1129     while (height--) {
1130         /* *INDENT-OFF* */
1131         DUFFS_LOOP4({
1132         Uint32 s = *srcp;
1133         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1134         /* FIXME: Here we special-case opaque alpha since the
1135            compositioning used (>>8 instead of /255) doesn't handle
1136            it correctly. Also special-case alpha=0 for speed?
1137            Benchmark this! */
1138         if(alpha) {
1139           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1140             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
1141           } else {
1142             Uint32 d = *dstp;
1143             /*
1144              * convert source and destination to G0RAB65565
1145              * and blend all components at the same time
1146              */
1147             s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1148               + (s >> 3 & 0x1f);
1149             d = (d | d << 16) & 0x07e0f81f;
1150             d += (s - d) * alpha >> 5;
1151             d &= 0x07e0f81f;
1152             *dstp = (Uint16)(d | d >> 16);
1153           }
1154         }
1155         srcp++;
1156         dstp++;
1157         }, width);
1158         /* *INDENT-ON* */
1159         srcp += srcskip;
1160         dstp += dstskip;
1161     }
1162 }
1163 
1164 /* fast ARGB8888->RGB555 blending with pixel alpha */
1165 static void
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)1166 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
1167 {
1168     int width = info->dst_w;
1169     int height = info->dst_h;
1170     Uint32 *srcp = (Uint32 *) info->src;
1171     int srcskip = info->src_skip >> 2;
1172     Uint16 *dstp = (Uint16 *) info->dst;
1173     int dstskip = info->dst_skip >> 1;
1174 
1175     while (height--) {
1176         /* *INDENT-OFF* */
1177         DUFFS_LOOP4({
1178         unsigned alpha;
1179         Uint32 s = *srcp;
1180         alpha = s >> 27; /* downscale alpha to 5 bits */
1181         /* FIXME: Here we special-case opaque alpha since the
1182            compositioning used (>>8 instead of /255) doesn't handle
1183            it correctly. Also special-case alpha=0 for speed?
1184            Benchmark this! */
1185         if(alpha) {
1186           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1187             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
1188           } else {
1189             Uint32 d = *dstp;
1190             /*
1191              * convert source and destination to G0RAB65565
1192              * and blend all components at the same time
1193              */
1194             s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1195               + (s >> 3 & 0x1f);
1196             d = (d | d << 16) & 0x03e07c1f;
1197             d += (s - d) * alpha >> 5;
1198             d &= 0x03e07c1f;
1199             *dstp = (Uint16)(d | d >> 16);
1200           }
1201         }
1202         srcp++;
1203         dstp++;
1204         }, width);
1205         /* *INDENT-ON* */
1206         srcp += srcskip;
1207         dstp += dstskip;
1208     }
1209 }
1210 
1211 /* General (slow) N->N blending with per-surface alpha */
1212 static void
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)1213 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
1214 {
1215     int width = info->dst_w;
1216     int height = info->dst_h;
1217     Uint8 *src = info->src;
1218     int srcskip = info->src_skip;
1219     Uint8 *dst = info->dst;
1220     int dstskip = info->dst_skip;
1221     SDL_PixelFormat *srcfmt = info->src_fmt;
1222     SDL_PixelFormat *dstfmt = info->dst_fmt;
1223     int srcbpp = srcfmt->BytesPerPixel;
1224     int dstbpp = dstfmt->BytesPerPixel;
1225     Uint32 Pixel;
1226     unsigned sR, sG, sB;
1227     unsigned dR, dG, dB, dA;
1228     const unsigned sA = info->a;
1229 
1230     if (sA) {
1231         while (height--) {
1232         /* *INDENT-OFF* */
1233         DUFFS_LOOP4(
1234         {
1235         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1236         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1237         ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1238         ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1239         src += srcbpp;
1240         dst += dstbpp;
1241         },
1242         width);
1243         /* *INDENT-ON* */
1244             src += srcskip;
1245             dst += dstskip;
1246         }
1247     }
1248 }
1249 
1250 /* General (slow) colorkeyed N->N blending with per-surface alpha */
1251 static void
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)1252 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
1253 {
1254     int width = info->dst_w;
1255     int height = info->dst_h;
1256     Uint8 *src = info->src;
1257     int srcskip = info->src_skip;
1258     Uint8 *dst = info->dst;
1259     int dstskip = info->dst_skip;
1260     SDL_PixelFormat *srcfmt = info->src_fmt;
1261     SDL_PixelFormat *dstfmt = info->dst_fmt;
1262     Uint32 ckey = info->colorkey;
1263     int srcbpp = srcfmt->BytesPerPixel;
1264     int dstbpp = dstfmt->BytesPerPixel;
1265     Uint32 Pixel;
1266     unsigned sR, sG, sB;
1267     unsigned dR, dG, dB, dA;
1268     const unsigned sA = info->a;
1269 
1270     while (height--) {
1271         /* *INDENT-OFF* */
1272         DUFFS_LOOP4(
1273         {
1274         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1275         if(sA && Pixel != ckey) {
1276             RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1277             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1278             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1279             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1280         }
1281         src += srcbpp;
1282         dst += dstbpp;
1283         },
1284         width);
1285         /* *INDENT-ON* */
1286         src += srcskip;
1287         dst += dstskip;
1288     }
1289 }
1290 
1291 /* General (slow) N->N blending with pixel alpha */
1292 static void
BlitNtoNPixelAlpha(SDL_BlitInfo * info)1293 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
1294 {
1295     int width = info->dst_w;
1296     int height = info->dst_h;
1297     Uint8 *src = info->src;
1298     int srcskip = info->src_skip;
1299     Uint8 *dst = info->dst;
1300     int dstskip = info->dst_skip;
1301     SDL_PixelFormat *srcfmt = info->src_fmt;
1302     SDL_PixelFormat *dstfmt = info->dst_fmt;
1303     int srcbpp;
1304     int dstbpp;
1305     Uint32 Pixel;
1306     unsigned sR, sG, sB, sA;
1307     unsigned dR, dG, dB, dA;
1308 
1309     /* Set up some basic variables */
1310     srcbpp = srcfmt->BytesPerPixel;
1311     dstbpp = dstfmt->BytesPerPixel;
1312 
1313     while (height--) {
1314         /* *INDENT-OFF* */
1315         DUFFS_LOOP4(
1316         {
1317         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1318         if(sA) {
1319             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1320             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1321             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1322         }
1323         src += srcbpp;
1324         dst += dstbpp;
1325         },
1326         width);
1327         /* *INDENT-ON* */
1328         src += srcskip;
1329         dst += dstskip;
1330     }
1331 }
1332 
1333 
1334 SDL_BlitFunc
SDL_CalculateBlitA(SDL_Surface * surface)1335 SDL_CalculateBlitA(SDL_Surface * surface)
1336 {
1337     SDL_PixelFormat *sf = surface->format;
1338     SDL_PixelFormat *df = surface->map->dst->format;
1339 
1340     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1341     case SDL_COPY_BLEND:
1342         /* Per-pixel alpha blits */
1343         switch (df->BytesPerPixel) {
1344         case 1:
1345             if (df->palette != NULL) {
1346                 return BlitNto1PixelAlpha;
1347             } else {
1348                 /* RGB332 has no palette ! */
1349                 return BlitNtoNPixelAlpha;
1350             }
1351 
1352         case 2:
1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
1354                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1355                     && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
1356                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1357                     || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
1358                 {
1359 #if SDL_ARM_NEON_BLITTERS
1360                     if (SDL_HasNEON())
1361                         return BlitARGBto565PixelAlphaARMNEON;
1362 #endif
1363 #if SDL_ARM_SIMD_BLITTERS
1364                     if (SDL_HasARMSIMD())
1365                         return BlitARGBto565PixelAlphaARMSIMD;
1366 #endif
1367                 }
1368 #endif
1369                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1370                     && sf->Gmask == 0xff00
1371                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1372                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1373                 if (df->Gmask == 0x7e0)
1374                     return BlitARGBto565PixelAlpha;
1375                 else if (df->Gmask == 0x3e0)
1376                     return BlitARGBto555PixelAlpha;
1377             }
1378             return BlitNtoNPixelAlpha;
1379 
1380         case 4:
1381             if (sf->Rmask == df->Rmask
1382                 && sf->Gmask == df->Gmask
1383                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1384 #if defined(__MMX__) || defined(__3dNOW__)
1385                 if (sf->Rshift % 8 == 0
1386                     && sf->Gshift % 8 == 0
1387                     && sf->Bshift % 8 == 0
1388                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1389 #ifdef __3dNOW__
1390                     if (SDL_Has3DNow())
1391                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1392 #endif
1393 #ifdef __MMX__
1394                     if (SDL_HasMMX())
1395                         return BlitRGBtoRGBPixelAlphaMMX;
1396 #endif
1397                 }
1398 #endif /* __MMX__ || __3dNOW__ */
1399                 if (sf->Amask == 0xff000000) {
1400 #if SDL_ARM_NEON_BLITTERS
1401                     if (SDL_HasNEON())
1402                         return BlitRGBtoRGBPixelAlphaARMNEON;
1403 #endif
1404 #if SDL_ARM_SIMD_BLITTERS
1405                     if (SDL_HasARMSIMD())
1406                         return BlitRGBtoRGBPixelAlphaARMSIMD;
1407 #endif
1408                     return BlitRGBtoRGBPixelAlpha;
1409                 }
1410             }
1411             return BlitNtoNPixelAlpha;
1412 
1413         case 3:
1414         default:
1415             break;
1416         }
1417         return BlitNtoNPixelAlpha;
1418 
1419     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1420         if (sf->Amask == 0) {
1421             /* Per-surface alpha blits */
1422             switch (df->BytesPerPixel) {
1423             case 1:
1424                 if (df->palette != NULL) {
1425                     return BlitNto1SurfaceAlpha;
1426                 } else {
1427                     /* RGB332 has no palette ! */
1428                     return BlitNtoNSurfaceAlpha;
1429                 }
1430 
1431             case 2:
1432                 if (surface->map->identity) {
1433                     if (df->Gmask == 0x7e0) {
1434 #ifdef __MMX__
1435                         if (SDL_HasMMX())
1436                             return Blit565to565SurfaceAlphaMMX;
1437                         else
1438 #endif
1439                             return Blit565to565SurfaceAlpha;
1440                     } else if (df->Gmask == 0x3e0) {
1441 #ifdef __MMX__
1442                         if (SDL_HasMMX())
1443                             return Blit555to555SurfaceAlphaMMX;
1444                         else
1445 #endif
1446                             return Blit555to555SurfaceAlpha;
1447                     }
1448                 }
1449                 return BlitNtoNSurfaceAlpha;
1450 
1451             case 4:
1452                 if (sf->Rmask == df->Rmask
1453                     && sf->Gmask == df->Gmask
1454                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1455 #ifdef __MMX__
1456                     if (sf->Rshift % 8 == 0
1457                         && sf->Gshift % 8 == 0
1458                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
1459                         return BlitRGBtoRGBSurfaceAlphaMMX;
1460 #endif
1461                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1462                         return BlitRGBtoRGBSurfaceAlpha;
1463                     }
1464                 }
1465                 return BlitNtoNSurfaceAlpha;
1466 
1467             case 3:
1468             default:
1469                 return BlitNtoNSurfaceAlpha;
1470             }
1471         }
1472         break;
1473 
1474     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1475         if (sf->Amask == 0) {
1476             if (df->BytesPerPixel == 1) {
1477 
1478                 if (df->palette != NULL) {
1479                     return BlitNto1SurfaceAlphaKey;
1480                 } else {
1481                     /* RGB332 has no palette ! */
1482                     return BlitNtoNSurfaceAlphaKey;
1483                 }
1484             } else {
1485                 return BlitNtoNSurfaceAlphaKey;
1486             }
1487         }
1488         break;
1489     }
1490 
1491     return NULL;
1492 }
1493 
1494 #endif /* SDL_HAVE_BLIT_A */
1495 
1496 /* vi: set ts=4 sw=4 expandtab: */
1497