1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // NEON common code.
11
12 #ifndef WEBP_DSP_NEON_H_
13 #define WEBP_DSP_NEON_H_
14
15 #include <arm_neon.h>
16
17 #include "src/dsp/dsp.h"
18
19 // Right now, some intrinsics functions seem slower, so we disable them
20 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
21 // incompatible.
22 #if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
23 #define WEBP_USE_INTRINSICS // use intrinsics when possible
24 #endif
25
// Stores 'a' into v.val[0] and 'b' into v.val[1], in that order.
// 'v' may be any lvalue expression with an assignable 'val' array member
// (e.g. the multi-vector types from <arm_neon.h>). It is expanded once per
// store, so it should be free of side effects. All parameters are
// parenthesized so that arguments such as '*ptr' expand as intended.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
30
// Stores 'a', 'b' and 'c' into v.val[0], v.val[1] and v.val[2], in that order.
// 'v' may be any lvalue expression with an assignable 'val' array member
// (e.g. the multi-vector types from <arm_neon.h>). It is expanded once per
// store, so it should be free of side effects. All parameters are
// parenthesized so that arguments such as '*ptr' expand as intended.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
36
// Stores 'a', 'b', 'c' and 'd' into v.val[0] .. v.val[3], in that order.
// 'v' may be any lvalue expression with an assignable 'val' array member
// (e.g. the multi-vector types from <arm_neon.h>). It is expanded once per
// store, so it should be free of side effects. All parameters are
// parenthesized so that arguments such as '*ptr' expand as intended.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
43
44 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
45 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
46 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
47 #if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
48 #define WORK_AROUND_GCC
49 #endif
50
Transpose4x4_NEON(const int32x4x4_t rows)51 static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
52 uint64x2x2_t row01, row23;
53
54 row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
55 row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
56 row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
57 row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
58 // Transpose 64-bit values (there's no vswp equivalent)
59 {
60 const uint64x1_t row0h = vget_high_u64(row01.val[0]);
61 const uint64x1_t row2l = vget_low_u64(row23.val[0]);
62 const uint64x1_t row1h = vget_high_u64(row01.val[1]);
63 const uint64x1_t row3l = vget_low_u64(row23.val[1]);
64 row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
65 row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
66 row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
67 row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
68 }
69 {
70 const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
71 vreinterpretq_s32_u64(row01.val[1]));
72 const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
73 vreinterpretq_s32_u64(row23.val[1]));
74 int32x4x4_t out;
75 out.val[0] = out01.val[0];
76 out.val[1] = out01.val[1];
77 out.val[2] = out23.val[0];
78 out.val[3] = out23.val[1];
79 return out;
80 }
81 }
82
#if 0   // Useful debug macro.
#include <stdio.h>
// Prints the spelling of REG, then SIZE, then the content of REG in
// hexadecimal, followed by a newline:
//   SIZE == 8:  REG is stored with vst1_u8() and printed as 8 bytes.
//   SIZE == 16: REG is stored with vst1_u16() and printed as 4 half-words.
//   otherwise:  only the header and the newline are printed.
// REG appears in both branches, so it must be accepted by both store
// intrinsics at compile time. Locals carry a 'print_reg_' prefix so that an
// argument like 'regs[i]' is not captured by the macro's own loop index.
#define PRINT_REG(REG, SIZE) do {                                         \
  int print_reg_i_;                                                       \
  printf("%s \t[%d]: 0x", #REG, (SIZE));                                  \
  if ((SIZE) == 8) {                                                      \
    uint8_t print_reg_tmp_[8];                                            \
    vst1_u8(print_reg_tmp_, (REG));                                       \
    for (print_reg_i_ = 0; print_reg_i_ < 8; ++print_reg_i_) {            \
      printf("%.2x ", print_reg_tmp_[print_reg_i_]);                      \
    }                                                                     \
  } else if ((SIZE) == 16) {                                              \
    uint16_t print_reg_tmp_[4];                                           \
    vst1_u16(print_reg_tmp_, (REG));                                      \
    for (print_reg_i_ = 0; print_reg_i_ < 4; ++print_reg_i_) {            \
      printf("%.4x ", print_reg_tmp_[print_reg_i_]);                      \
    }                                                                     \
  }                                                                       \
  printf("\n");                                                           \
} while (0)
#endif
100
101 #endif // WEBP_DSP_NEON_H_
102