#include <stdbool.h>

#if defined(__i386__) && VEC_SIZE == 16
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tpush %ebp\n" \
      "\tmov %esp,%ebp\n" \
      "\tand $~0xf,%esp\n" \
      "\tcall " #name "\n" \
      "\tleave\n" \
      "\tret" )
#else
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tjmp " #name )
#endif
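
/*
 * Illustrative usage (hypothetical test name): exposing simd_test() as the
 * program's entry point. The i386/16-byte variant additionally realigns the
 * stack, since 16-byte vector spills need an alignment that isn't guaranteed
 * at raw _start entry:
 *
 *     int simd_test(void);
 *     ENTRY(simd_test);
 */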

typedef
#if defined(INT_SIZE)
# define ELEM_SIZE INT_SIZE
signed int
# if INT_SIZE == 1
#  define MODE QI
# elif INT_SIZE == 2
#  define MODE HI
# elif INT_SIZE == 4
#  define MODE SI
# elif INT_SIZE == 8
#  define MODE DI
# endif
#elif defined(UINT_SIZE)
# define ELEM_SIZE UINT_SIZE
unsigned int
# if UINT_SIZE == 1
#  define MODE QI
# elif UINT_SIZE == 2
#  define MODE HI
# elif UINT_SIZE == 4
#  define MODE SI
# elif UINT_SIZE == 8
#  define MODE DI
# endif
#elif defined(FLOAT_SIZE)
float
# define ELEM_SIZE FLOAT_SIZE
# if FLOAT_SIZE == 4
#  define MODE SF
#  define ELEM_SFX "s"
# elif FLOAT_SIZE == 8
#  define MODE DF
#  define ELEM_SFX "d"
# elif FLOAT_SIZE == 2
#  define MODE HF
#  define ELEM_SFX "h"
# endif
#endif
#ifndef VEC_SIZE
# define VEC_SIZE ELEM_SIZE
#endif
__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;

#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
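
/*
 * Worked example: building with -DINT_SIZE=4 -DVEC_SIZE=16 selects mode SI,
 * making vec_t a 16-byte vector of four 32-bit signed integers, with
 * ELEM_COUNT evaluating to 4. Without VEC_SIZE on the command line, vec_t
 * degenerates to a single-element vector of ELEM_SIZE bytes.
 */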

typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;

/* Various builtins want plain char / int / long long vector types ... */
typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
#if VEC_SIZE >= 4
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
typedef float __attribute__((vector_size(VEC_SIZE))) vsf_t;
#endif
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
#endif

#if ELEM_SIZE == 1
typedef vqi_t vint_t;
#elif ELEM_SIZE == 2
typedef vhi_t vint_t;
#elif ELEM_SIZE == 4
typedef vsi_t vint_t;
#elif ELEM_SIZE == 8
typedef vdi_t vint_t;
#endif

#if VEC_SIZE >= 16

# if ELEM_COUNT >= 2
#  if VEC_SIZE > 32
#   define HALF_SIZE (VEC_SIZE / 2)
#  else
#   define HALF_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
#  ifdef __AVX512FP16__
typedef _Float16 __attribute__((vector_size(HALF_SIZE))) vhf_half_t;
#  endif
typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
# endif

# if ELEM_COUNT >= 4
#  if VEC_SIZE > 64
#   define QUARTER_SIZE (VEC_SIZE / 4)
#  else
#   define QUARTER_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
#  ifdef __AVX512FP16__
typedef _Float16 __attribute__((vector_size(QUARTER_SIZE))) vhf_quarter_t;
#  endif
# endif

# if ELEM_COUNT >= 8
#  if VEC_SIZE > 128
#   define EIGHTH_SIZE (VEC_SIZE / 8)
#  else
#   define EIGHTH_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
# endif

# define DECL_PAIR(w) \
typedef w ## _t pair_t; \
typedef vsi_ ## w ## _t vsi_pair_t; \
typedef vdi_ ## w ## _t vdi_pair_t
# define DECL_QUARTET(w) \
typedef w ## _t quartet_t; \
typedef vsi_ ## w ## _t vsi_quartet_t; \
typedef vdi_ ## w ## _t vdi_quartet_t
# define DECL_OCTET(w) \
typedef w ## _t octet_t; \
typedef vsi_ ## w ## _t vsi_octet_t; \
typedef vdi_ ## w ## _t vdi_octet_t

# if ELEM_COUNT == 4
DECL_PAIR(half);
# elif ELEM_COUNT == 8
DECL_PAIR(quarter);
DECL_QUARTET(half);
# elif ELEM_COUNT == 16
DECL_PAIR(eighth);
DECL_QUARTET(quarter);
DECL_OCTET(half);
# endif

# undef DECL_OCTET
# undef DECL_QUARTET
# undef DECL_PAIR
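
/*
 * Note on naming (as the declarations above suggest): pair_t, quartet_t,
 * and octet_t hold 2, 4, or 8 elements of vec_t respectively, padded up to
 * the 16-byte minimum vector width where necessary, for builtins wanting a
 * narrower-than-full-width operand type.
 */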

#endif

#if VEC_SIZE == 16
# define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
# define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
#elif VEC_SIZE == 32
# define B(n, s, a...)   __builtin_ia32_ ## n ## 256 ## s(a)
#elif VEC_SIZE == 64
# define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
# define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
# define BR2(n, s, a...) __builtin_ia32_ ## n ## 512 ## s ## _round(a, 4)
#endif
#ifndef B_
# define B_ B
#endif
#ifndef BR
# define BR B
# define BR_ B_
#endif
#ifndef BR2
# define BR2 BR
#endif
#ifndef BR_
# define BR_ BR
#endif
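
/*
 * Expansion example (using a builtin known to exist in gcc): with
 * VEC_SIZE == 16,
 *
 *     B(pabsb, _mask, x, (vqi_t){}, ~0)
 *
 * becomes __builtin_ia32_pabsb128_mask(x, (vqi_t){}, ~0). At VEC_SIZE == 64
 * the BR() flavor additionally appends a trailing 4, i.e.
 * _MM_FROUND_CUR_DIRECTION, as the rounding / SAE argument.
 */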

#ifdef __AVX512F__

/* Sadly there are a few exceptions to the general naming rules. */
# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
# define __builtin_ia32_exp2pd512_mask __builtin_ia32_exp2pd_mask
# define __builtin_ia32_exp2ps512_mask __builtin_ia32_exp2ps_mask
# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
# define __builtin_ia32_rcp28pd512_mask __builtin_ia32_rcp28pd_mask
# define __builtin_ia32_rcp28ps512_mask __builtin_ia32_rcp28ps_mask
# define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
# define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
# define __builtin_ia32_rsqrt28pd512_mask __builtin_ia32_rsqrt28pd_mask
# define __builtin_ia32_rsqrt28ps512_mask __builtin_ia32_rsqrt28ps_mask
# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
# define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
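
/*
 * Sub-512-bit (and scalar) operations below presumably need the AVX512VL
 * encodings; enable the feature for the compiler even when the build as a
 * whole doesn't do so.
 */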
# if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64
#  pragma GCC target ( "avx512vl" )
# endif

# define REN(insn, old, new)                     \
    asm ( ".macro v" #insn #old " o:vararg \n\t" \
          "v" #insn #new " \\o             \n\t" \
          ".endm" )
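
/*
 * Example: REN(extract, f128, f64x2) defines an assembler macro named
 * vextractf128 which forwards its operands to vextractf64x2, transparently
 * substituting the EVEX-encodable mnemonic.
 */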

/*
 * The original plan was to effect use of EVEX encodings for scalar as well as
 * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
 * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
 * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
 * which has the benefit of also working for 32-bit. Granted, there is a lot of
 * escaping to get right here.
 */
asm ( ".macro override insn    \n\t"
      ".macro $\\insn o:vararg \n\t"
      ".purgem \\insn          \n\t"
      "{evex} \\insn \\(\\)o   \n\t"
      ".macro \\insn o:vararg  \n\t"
      "$\\insn \\(\\(\\))o     \n\t"
      ".endm                   \n\t"
      ".endm                   \n\t"
      ".macro \\insn o:vararg  \n\t"
      "$\\insn \\(\\)o         \n\t"
      ".endm                   \n\t"
      ".endm" );

# define OVR(n) asm ( "override v" #n )
# define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)
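
/*
 * Example: OVR(movntdq) redefines the vmovntdq mnemonic at the assembler
 * level so that every subsequent use gets an explicit {evex} prefix,
 * forcing the EVEX encoding of the instruction to be exercised.
 */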

# ifdef __AVX512VL__
#  ifdef __AVX512BW__
#   define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
#  else
#   define OVR_BW(n)
#  endif
#  define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
#  define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
# else
#  define OVR_BW(n)
#  define OVR_DQ(n)
#  define OVR_VFP(n)
# endif

# define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
                       OVR_ ## w(n ## 231)
# define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
# define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
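
/*
 * Expansion example: OVR_FP(add) covers the whole family, overriding
 * vaddpd and vaddps (via OVR_VFP) plus vaddsd and vaddss (via OVR_SFP),
 * while OVR_INT(add) yields vpaddb/vpaddw/vpaddd/vpaddq when both
 * AVX512VL and AVX512BW are available.
 */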

OVR_INT(broadcast);
# ifdef __AVX512VL__
OVR_SFP(broadcast);
# endif
OVR_SFP(comi);
OVR_VFP(cvtdq2);
OVR_INT(abs);
OVR_FP(add);
OVR_INT(add);
OVR_BW(adds);
OVR_BW(addus);
OVR_BW(avg);
OVR_FP(div);
OVR(extractps);
OVR_FMA(fmadd, FP);
OVR_FMA(fmaddsub, VFP);
OVR_FMA(fmsub, FP);
OVR_FMA(fmsubadd, VFP);
OVR_FMA(fnmadd, FP);
OVR_FMA(fnmsub, FP);
OVR(insertps);
OVR_FP(max);
OVR_INT(maxs);
OVR_INT(maxu);
OVR_FP(min);
OVR_INT(mins);
OVR_INT(minu);
OVR(movd);
OVR(movq);
OVR_SFP(mov);
OVR_VFP(mova);
OVR(movhlps);
OVR(movhpd);
OVR(movhps);
OVR(movlhps);
OVR(movlpd);
OVR(movlps);
OVR_VFP(movnt);
OVR_VFP(movu);
OVR_FP(mul);
OVR_VFP(perm);
OVR_VFP(permil);
OVR_VFP(shuf);
OVR_INT(sll);
OVR_DQ(sllv);
OVR_FP(sqrt);
OVR_INT(sra);
OVR_DQ(srav);
OVR_INT(srl);
OVR_DQ(srlv);
OVR_FP(sub);
OVR_INT(sub);
OVR_BW(subs);
OVR_BW(subus);
OVR_SFP(ucomi);
OVR_VFP(unpckh);
OVR_VFP(unpckl);

# ifdef __AVX512VL__
#  if ELEM_SIZE == 8 && defined(__AVX512DQ__)
REN(extract, f128, f64x2);
REN(extract, i128, i64x2);
REN(insert, f128, f64x2);
REN(insert, i128, i64x2);
#  else
REN(extract, f128, f32x4);
REN(extract, i128, i32x4);
REN(insert, f128, f32x4);
REN(insert, i128, i32x4);
#  endif
#  if ELEM_SIZE == 8
REN(movdqa, , 64);
REN(movdqu, , 64);
REN(pand, , q);
REN(pandn, , q);
REN(por, , q);
REN(pxor, , q);
#  else
#   if ELEM_SIZE == 1 && defined(__AVX512BW__)
REN(movdq, a, u8);
REN(movdqu, , 8);
#   elif ELEM_SIZE == 2 && defined(__AVX512BW__)
REN(movdq, a, u16);
REN(movdqu, , 16);
#   else
REN(movdqa, , 32);
REN(movdqu, , 32);
#   endif
REN(pand, , d);
REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
#  endif
OVR(aesdec);
OVR(aesdeclast);
OVR(aesenc);
OVR(aesenclast);
OVR(cvtpd2dqx);
OVR(cvtpd2dqy);
OVR(cvtpd2psx);
OVR(cvtpd2psy);
OVR(cvtph2ps);
OVR(cvtps2dq);
OVR(cvtps2pd);
OVR(cvtps2ph);
OVR(cvtsd2ss);
OVR(cvtsd2si);
OVR(cvtsd2sil);
OVR(cvtsd2siq);
OVR(cvtsi2sd);
OVR(cvtsi2sdl);
OVR(cvtsi2sdq);
OVR(cvtsi2ss);
OVR(cvtsi2ssl);
OVR(cvtsi2ssq);
OVR(cvtss2sd);
OVR(cvtss2si);
OVR(cvtss2sil);
OVR(cvtss2siq);
OVR(cvttpd2dqx);
OVR(cvttpd2dqy);
OVR(cvttps2dq);
OVR(cvttsd2si);
OVR(cvttsd2sil);
OVR(cvttsd2siq);
OVR(cvttss2si);
OVR(cvttss2sil);
OVR(cvttss2siq);
OVR(gf2p8mulb);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
OVR(movshdup);
OVR(movsldup);
OVR(pclmulqdq);
OVR(permd);
OVR(permq);
OVR(pmovsxbd);
OVR(pmovsxbq);
OVR(pmovsxdq);
OVR(pmovsxwd);
OVR(pmovsxwq);
OVR(pmovzxbd);
OVR(pmovzxbq);
OVR(pmovzxdq);
OVR(pmovzxwd);
OVR(pmovzxwq);
OVR(pmulld);
OVR(pmuldq);
OVR(pmuludq);
OVR(pshufd);
OVR(punpckhdq);
OVR(punpckhqdq);
OVR(punpckldq);
OVR(punpcklqdq);
# endif

# ifdef __AVX512BW__
OVR(pextrb);
OVR(pextrw);
OVR(pinsrb);
OVR(pinsrw);
#  ifdef __AVX512VL__
OVR(packssdw);
OVR(packsswb);
OVR(packusdw);
OVR(packuswb);
OVR(palignr);
OVR(pmaddubsw);
OVR(pmaddwd);
OVR(pmovsxbw);
OVR(pmovzxbw);
OVR(pmulhrsw);
OVR(pmulhuw);
OVR(pmulhw);
OVR(pmullw);
OVR(psadbw);
OVR(pshufb);
OVR(pshufhw);
OVR(pshuflw);
OVR(pslldq);
OVR(psrldq);
OVR(punpckhbw);
OVR(punpckhwd);
OVR(punpcklbw);
OVR(punpcklwd);
#  endif
# endif

# ifdef __AVX512DQ__
OVR_VFP(and);
OVR_VFP(andn);
OVR_VFP(or);
OVR(pextrd);
OVR(pextrq);
OVR(pinsrd);
OVR(pinsrq);
#  ifdef __AVX512VL__
OVR(pmullq);
#  endif
OVR_VFP(xor);
# endif

# undef OVR_VFP
# undef OVR_SFP
# undef OVR_INT
# undef OVR_FP
# undef OVR_FMA
# undef OVR_DQ
# undef OVR_BW
# undef OVR

#endif /* __AVX512F__ */

/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. This at once makes the compiler use memory operands
 * more often, which for our purposes is the more interesting case.
 */
#define touch(var) asm volatile ( "" : "+m" (var) )
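
/*
 * Usage sketch: the empty asm claims to read and modify var in memory, so
 * the compiler has to spill it and can't constant-fold across the barrier:
 *
 *     vec_t x = src + dst;   // hypothetical operands
 *     touch(x);              // x now lives in (and is reloaded from) memory
 *     x *= 2;
 */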

/*
 * Produce a vector with unspecified contents. The customary
 * self-initialization idiom sidesteps "uninitialized" diagnostics while
 * leaving it entirely up to the compiler what value actually results.
 */
static inline vec_t undef(void)
{
    vec_t v = v;
    return v;
}