#include <stdbool.h>

/*
 * Program entry point, written as top-level assembly.
 *
 * For 32-bit x86 builds with 16-byte vectors a frame is set up and the stack
 * pointer is rounded down to a 16-byte boundary before simd_test() is called;
 * "leave" then restores the original stack pointer ahead of the return. In
 * every other configuration control is simply transferred to simd_test() with
 * a jump.
 *
 * Note that VEC_SIZE is evaluated here ahead of the fallback definition
 * further down in this file; an undefined VEC_SIZE evaluates as 0 in the #if
 * and hence selects the plain jump.
 */
asm (
    "\t.text\n"
    "\t.globl _start\n"
    "_start:\n"
#if defined(__i386__) && VEC_SIZE == 16
    "\tpush %ebp\n"
    "\tmov %esp,%ebp\n"
    "\tand $~0xf,%esp\n"
    "\tcall simd_test\n"
    "\tleave\n"
    "\tret"
#else
    "\tjmp simd_test"
#endif
    );

/*
 * vec_t - the vector type under test.
 *
 * The element kind is chosen by whichever of INT_SIZE (signed integer),
 * UINT_SIZE (unsigned integer) or FLOAT_SIZE (floating point) is defined;
 * its value is the element size in bytes and is recorded as ELEM_SIZE. MODE
 * is set to the matching GCC machine mode name (QI/HI/SI/DI or SF/DF).
 * These selectors are not defined in this file - presumably they are passed
 * in by the build; confirm against the build rules.
 *
 * VEC_SIZE (the total vector size in bytes) defaults to ELEM_SIZE, i.e. a
 * single-element vector, when not supplied from outside.
 *
 * The typedef is deliberately spread across the conditional: the base type
 * keyword is emitted by the selected branch, and the attribute list plus the
 * type name follow once MODE and VEC_SIZE are known.
 */
typedef
#if defined(INT_SIZE)
# define ELEM_SIZE INT_SIZE
signed int
# if INT_SIZE == 1
#  define MODE QI
# elif INT_SIZE == 2
#  define MODE HI
# elif INT_SIZE == 4
#  define MODE SI
# elif INT_SIZE == 8
#  define MODE DI
# endif
#elif defined(UINT_SIZE)
# define ELEM_SIZE UINT_SIZE
unsigned int
# if UINT_SIZE == 1
#  define MODE QI
# elif UINT_SIZE == 2
#  define MODE HI
# elif UINT_SIZE == 4
#  define MODE SI
# elif UINT_SIZE == 8
#  define MODE DI
# endif
#elif defined(FLOAT_SIZE)
float
# define ELEM_SIZE FLOAT_SIZE
# if FLOAT_SIZE == 4
#  define MODE SF
# elif FLOAT_SIZE == 8
#  define MODE DF
# endif
#endif
#ifndef VEC_SIZE
# define VEC_SIZE ELEM_SIZE
#endif
__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;

/* Number of elements held by a vec_t. */
#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)

/* A vector of the same overall size as vec_t, viewed as unsigned bytes. */
typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;

/* Various builtins want plain char / int / long long vector types ... */
typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
/* ... with the long long flavour only being declared for 8 bytes and up. */
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
#endif

/*
 * to_bool(cmp) - collapse the result of a vector comparison into a scalar
 * that is non-zero only when the comparison held for all elements.
 *
 * Depending on vector size, element size and the instruction set extensions
 * the compiler was told to use, this is done either by extracting a bit mask
 * from the comparison result and comparing it against the "all set" value,
 * or by using one of the "test" builtins against an all-ones vector (which
 * is what comparing a zero vector against 0 produces).
 *
 * Combinations not covered here leave to_bool undefined, so that the generic
 * byte-wise fallback further down gets used instead.
 */
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
# if defined(__AVX__) && defined(FLOAT_SIZE)
#  if ELEM_SIZE == 4
#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
#  elif ELEM_SIZE == 8
#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
#  endif
# elif defined(__SSE4_1__)
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
# elif defined(__SSE__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
#  if ELEM_SIZE == 8
#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
#  else
#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
#  endif
# endif
#elif VEC_SIZE == 32
# if defined(__AVX__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
# endif
#endif

#ifndef to_bool
/*
 * Generic fallback for configurations without a builtin based to_bool():
 * inspect the comparison result one byte at a time.
 *
 * Returns true only if each of the VEC_SIZE bytes of bv is 0xff, false as
 * soon as a byte with any other value is found.
 */
static inline bool _to_bool(byte_vec_t bv)
{
    unsigned int idx = 0;

    while ( idx < VEC_SIZE )
    {
        if ( bv[idx] != 0xff )
            return false;
        ++idx;
    }

    return true;
}
# define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
#endif

/*
 * to_int(x) - convert the floating point value(s) in x to integer and back
 * again, yielding a floating point vector holding integral values.
 *
 * Single-element vectors use a plain C cast of element 0; 16- and 32-byte
 * vectors use the packed float->int and int->float conversion builtins.
 * The macro remains undefined for non-floating-point configurations and for
 * vector sizes lacking the required instruction set support.
 */
#if VEC_SIZE == FLOAT_SIZE
# define to_int(x) ((vec_t){ (int)(x)[0] })
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
# endif
#endif

/*
 * scalar_1op(x, op) - apply a one-operand scalar instruction to a
 * single-element floating point vector.
 *
 * "op" is an inline assembly template using the named operands %[in] (x,
 * passed as a memory operand) and %[out] (a 16-byte vector register
 * temporary, marked early-clobber). Element 0 of that temporary is handed
 * back wrapped in a vec_t.
 */
#if VEC_SIZE == FLOAT_SIZE
# define scalar_1op(x, op) ({ \
    typeof((x)[0]) __attribute__((vector_size(16))) r_; \
    asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#endif

/*
 * Floating point operations, mapped to builtins (or, for single-element
 * vectors, to scalar instructions via scalar_1op()). Which of broadcast(),
 * interleave_hi() / interleave_lo(), max() / min(), recip(), rsqrt(), sqrt(),
 * swap() and swap2() get defined depends on element size, vector size and
 * the enabled instruction set extensions; simd_test() only exercises the
 * ones that end up defined.
 *
 * swap() reverses element order using immediate shuffle controls; swap2()
 * does the same with a variable control derived from "inv", which is NOT a
 * macro parameter - a variable of that name needs to be in scope wherever
 * swap2() is expanded.
 *
 * For double precision there are no dedicated reciprocal / reciprocal square
 * root builtins used here; recip() and rsqrt() instead convert to single
 * precision, use the single precision operation, and convert back.
 */
#if FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
#  define max(x, y) __builtin_ia32_maxps256(x, y)
#  define min(x, y) __builtin_ia32_minps256(x, y)
#  define recip(x) __builtin_ia32_rcpps256(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
#  define sqrt(x) __builtin_ia32_sqrtps256(x)
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
#  define swap2(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  ifdef __AVX__
#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
#  endif
#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
#  define max(x, y) __builtin_ia32_maxps(x, y)
#  define min(x, y) __builtin_ia32_minps(x, y)
#  define recip(x) __builtin_ia32_rcpps(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
#  define sqrt(x) __builtin_ia32_sqrtps(x)
#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
#  endif
# elif VEC_SIZE == 4
#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
# endif
#elif FLOAT_SIZE == 8 && defined(__SSE2__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
#  define max(x, y) __builtin_ia32_maxpd256(x, y)
#  define min(x, y) __builtin_ia32_minpd256(x, y)
#  define recip(x) ({ \
    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
    t_ = __builtin_ia32_vextractf128_ps256( \
             __builtin_ia32_rcpps256( \
                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
    __builtin_ia32_cvtps2pd256(t_); \
})
#  define rsqrt(x) ({ \
    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
    __builtin_ia32_cvtps2pd256(t1_); \
})
#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
#  define max(x, y) __builtin_ia32_maxpd(x, y)
#  define min(x, y) __builtin_ia32_minpd(x, y)
#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
#  define sqrt(x) __builtin_ia32_sqrtpd(x)
#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
#  endif
# elif VEC_SIZE == 8
#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
# endif
#endif
/*
 * SSE2 operations on 16-byte vectors.
 *
 * Per integer element size: interleave_hi() / interleave_lo() (unpack
 * builtins) and, except for byte elements, swap() (element order reversal
 * via immediate shuffles). Additionally max() / min() for unsigned bytes and
 * signed 16-bit elements, mul_hi() for 16-bit elements, and mul_full() for
 * unsigned 32-bit elements. Arguments and results get cast between vec_t and
 * the plain vector types the respective builtins are used with.
 *
 * select(d, x, y, m) is defined for every 16-byte configuration (integer or
 * floating point): it performs two masked byte stores to d, first of x under
 * mask m and then of y under the complemented mask.
 */
#if VEC_SIZE == 16 && defined(__SSE2__)
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
                   (vsi_t)__builtin_ia32_pshufhw( \
                          __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
# endif
# if UINT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2
#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
#  define min(x, y) __builtin_ia32_pminsw128(x, y)
#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
# elif UINT_SIZE == 2
#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
# elif UINT_SIZE == 4
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
# endif
# define select(d, x, y, m) ({ \
    void *d_ = (d); \
    vqi_t m_ = (vqi_t)(m); \
    __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
#endif
/*
 * Floating point add/subtract, duplicate and horizontal operations:
 * addsub(), dup_hi() (single precision only), dup_lo(), hadd() and hsub().
 *
 * 16-byte vectors use the SSE3 builtins directly, except for the double
 * precision dup_lo(), which is implemented with inline assembly taking
 * element 0 of its argument as a memory operand.
 *
 * For 32-byte (AVX) vectors the results of the horizontal add / subtract
 * builtins have their elements re-ordered, so that hadd() / hsub() present
 * the same layout as their 16-byte counterparts: all results derived from x
 * first, followed by all results derived from y.
 */
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup(x)
#  define dup_lo(x) __builtin_ia32_movsldup(x)
#  define hadd(x, y) __builtin_ia32_haddps(x, y)
#  define hsub(x, y) __builtin_ia32_hsubps(x, y)
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
#  define dup_lo(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
    r_; \
})
#  define hadd(x, y) __builtin_ia32_haddpd(x, y)
#  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup256(x)
#  define dup_lo(x) __builtin_ia32_movsldup256(x)
#  define hadd(x, y) ({ \
        vec_t t_ = __builtin_ia32_haddps256(x, y); \
        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
#  define hsub(x, y) ({ \
        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
#  define dup_lo(x) __builtin_ia32_movddup256(x)
#  define hadd(x, y) ({ \
        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
#  define hsub(x, y) ({ \
        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
# endif
#endif
/*
 * SSSE3 integer operations on 16-byte vectors:
 *  - abs() for signed 1-, 2- and 4-byte elements,
 *  - copysignz() (sign builtins) for 1-, 2- and 4-byte elements,
 *  - hadd() / hsub() for 2- and 4-byte elements,
 *  - rotr(x, n), rotating the vector by n elements by passing x as both
 *    inputs of the align builtin, with the count scaled to bits,
 *  - a byte shuffle based swap() for 1-byte elements. Like swap2() this
 *    uses "inv" from the scope the macro is expanded in as shuffle control.
 */
#if VEC_SIZE == 16 && defined(__SSSE3__)
# if INT_SIZE == 1
#  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define abs(x) __builtin_ia32_pabsw128(x)
# elif INT_SIZE == 4
#  define abs(x) __builtin_ia32_pabsd128(x)
# endif
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
# endif
#endif
/*
 * SSE4.1 operations on 16-byte vectors:
 *  - max() / min() for the integer element kinds not covered by SSE2,
 *  - widen1() / widen2() / widen3(), sign- or zero-extending the low
 *    elements to 2, 4 or 8 times their width (as far as such an extension
 *    exists for the element size),
 *  - mul_full() for signed 32-bit elements,
 *  - dot_product() and trunc() for floating point elements,
 *  - mix(), blending x and y with an immediate mask which picks y for the
 *    odd numbered elements.
 *
 * The SSE2 flavour of select() is dropped here and replaced by one using the
 * variable blend builtins; this one assigns its result through d rather than
 * performing masked stores.
 */
#if VEC_SIZE == 16 && defined(__SSE4_1__)
# if INT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
# elif INT_SIZE == 4
#  define max(x, y) __builtin_ia32_pmaxsd128(x, y)
#  define min(x, y) __builtin_ia32_pminsd128(x, y)
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
# elif UINT_SIZE == 1
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
# elif UINT_SIZE == 2
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
# elif UINT_SIZE == 4
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
# endif
# undef select
# if defined(INT_SIZE) || defined(UINT_SIZE)
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
# elif FLOAT_SIZE == 4
#  define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
#  define trunc(x) __builtin_ia32_roundps(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
#  define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
# endif
# if INT_SIZE == 2 || UINT_SIZE == 2
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
# elif FLOAT_SIZE == 4
#  define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
/*
 * AVX operations on 32-byte floating point vectors: dot_product() (single
 * precision only; the two values the builtin leaves in elements 0 and 4 get
 * added up into element 0 of the result), mix(), select() and trunc().
 *
 * select2() is an alternative to select() built from a masked load of x
 * under mask m followed by a masked store of y under the complemented mask.
 * As it takes the address of x, its second argument has to be an lvalue.
 */
#if VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define dot_product(x, y) ({ \
    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
    (vec_t){t_[0] + t_[4]}; \
})
#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vsi_t m_ = (vsi_t)(m); \
    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
    __builtin_ia32_maskstoreps256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vdi_t m_ = (vdi_t)(m); \
    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
    __builtin_ia32_maskstorepd256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
# endif
#endif
/*
 * Operations on single-element floating point vectors.
 *
 * max(x, y) / min(x, y) compare element 0 of both arguments in plain C and
 * return the larger / smaller one wrapped in a vec_t. Each argument is
 * evaluated for its value exactly once (the additional use inside typeof()
 * is not evaluated), and is fully parenthesized, so that expressions rather
 * than just plain identifiers can be passed safely.
 *
 * trunc(x), available with SSE4.1 only, applies the scalar rounding
 * instruction with immediate 0b1011 to x (passed as a memory operand) and
 * returns element 0 of the resulting register wrapped in a vec_t.
 */
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
# ifdef __SSE4_1__
#  if FLOAT_SIZE == 4
#   define trunc(x) ({ \
    float __attribute__((vector_size(16))) r_; \
    asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  elif FLOAT_SIZE == 8
#   define trunc(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  endif
# endif
#endif

/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. At the same time this makes the compiler use memory
 * operands more often, which for our purposes is the more interesting case.
 *
 * This works by way of an empty asm() statement which declares "var" to be
 * both read and written in memory.
 */
#define touch(var) asm volatile ( "" : "+m" (var) )

/*
 * Run all checks applicable to the configured vector type.
 *
 * Each check computes a result in (at least) two different ways and compares
 * the outcomes with to_bool(). Checks for operations whose macro did not get
 * defined for the configuration at hand are compiled out.
 *
 * Returns 0 if all checks passed, otherwise the source line number
 * (__LINE__) of the first check that failed.
 */
int simd_test(void)
{
    unsigned int i, j;
    vec_t x, y, z, src, inv, alt, sh;

    /*
     * Set up the inputs:
     *   src = { 1, 2, ..., ELEM_COUNT }
     *   inv = { ELEM_COUNT, ..., 2, 1 }
     *   alt = all ones / zero alternating (unsigned), or 1 / -1 alternating
     *         (signed and floating point), starting with the former
     *   sh  = per-element shift counts, starting out at the element width in
     *         bits and getting decremented for every index i for which
     *         i & (i + 1) is zero
     */
    for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
    {
        src[i] = i + 1;
        inv[i] = ELEM_COUNT - i;
#ifdef UINT_SIZE
        alt[i] = -!(i & 1);
#else
        alt[i] = i & 1 ? -1 : 1;
#endif
        if ( !(i & (i + 1)) )
            --j;
        sh[i] = j;
    }

    /* Plain copy. */
    touch(src);
    x = src;
    touch(x);
    if ( !to_bool(x == src) ) return __LINE__;

    /* Addition. */
    touch(src);
    y = x + src;
    touch(src);
    touch(y);
    if ( !to_bool(y == 2 * src) ) return __LINE__;

    /* Subtraction. */
    touch(src);
    z = y -= src;
    touch(z);
    if ( !to_bool(x == z) ) return __LINE__;

#if defined(UINT_SIZE)

    /* Bitwise operations: (a | b) & ~(a & b) == a ^ b. */
    touch(inv);
    x |= inv;
    touch(inv);
    y &= inv;
    touch(inv);
    z ^= inv;
    touch(inv);
    touch(x);
    if ( !to_bool((x & ~y) == z) ) return __LINE__;

#elif ELEM_SIZE > 1 || VEC_SIZE <= 8

    /* Multiplication: a * a - b * b == (a + b) * (a - b). */
    touch(src);
    x *= src;
    y = inv * inv;
    touch(src);
    z = src + inv;
    touch(inv);
    z *= (src - inv);
    if ( !to_bool(x - y == z) ) return __LINE__;

#endif

#if defined(FLOAT_SIZE)

    /* With alt holding only 1 and -1, multiplying and dividing by it agree. */
    x = src * alt;
    touch(alt);
    y = src / alt;
    if ( !to_bool(x == y) ) return __LINE__;
    touch(alt);
    touch(src);
    if ( !to_bool(x * -alt == -src) ) return __LINE__;

# if defined(recip) && defined(to_int)

    /* Reciprocal, applied twice and then converted to an integral value. */
    touch(src);
    x = recip(src);
    touch(src);
    touch(x);
    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;

#  ifdef rsqrt
    /* Reciprocal square root of a square. */
    x = src * src;
    touch(x);
    y = rsqrt(x);
    touch(y);
    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
    touch(src);
    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
#  endif

# endif

# ifdef sqrt
    /* Square root of a square. */
    x = src * src;
    touch(x);
    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
# endif

# ifdef trunc
    /* Truncating 1 / src is expected to leave 1 in element 0 and 0 elsewhere. */
    x = 1 / src;
    y = (vec_t){ 1 };
    touch(x);
    z = trunc(x);
    if ( !to_bool(y == z) ) return __LINE__;
# endif

#else

# if ELEM_SIZE > 1

    /*
     * Element-wise product of src and inv, checked against a vector of the
     * expected values which gets filled symmetrically from both ends.
     */
    touch(inv);
    x = src * inv;
    touch(inv);
    y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
    for ( i = 1; i < ELEM_COUNT / 2; ++i )
        y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
    if ( !to_bool(x == y) ) return __LINE__;

#  ifdef mul_hi
    /* Upper halves of the products of src and alt. */
    touch(alt);
    x = mul_hi(src, alt);
    touch(alt);
#   ifdef INT_SIZE
    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
#   else
    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
#   endif
#  endif

#  ifdef mul_full
    /*
     * Full width products of the even numbered elements, checked against
     * products calculated in 64-bit scalar arithmetic and then split into
     * low and high halves.
     */
    x = src ^ alt;
    touch(inv);
    y = mul_full(x, inv);
    touch(inv);
    for ( i = 0; i < ELEM_COUNT; i += 2 )
    {
        unsigned long long res = x[i] * 1ULL * inv[i];

        z[i] = res;
        z[i + 1] = res >> (ELEM_SIZE << 3);
    }
    if ( !to_bool(y == z) ) return __LINE__;
#  endif

    /* Shifts by constant counts. */
    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    touch(z);
    x = z << 3;
    touch(z);
    y = z << 2;
    touch(z);
    if ( !to_bool(x == y + y) ) return __LINE__;

    touch(x);
    z = x >> 2;
    touch(x);
    if ( !to_bool(y == z + z) ) return __LINE__;

    /* Shifts by a count held in a (scalar) variable. */
    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    /*
     * Note that despite the touch()-es here there doesn't appear to be a way
     * to make the compiler use a memory operand for the shift instruction (at
     * least without resorting to built-ins).
     */
    j = 3;
    touch(j);
    x = z << j;
    touch(j);
    j = 2;
    touch(j);
    y = z << j;
    touch(j);
    if ( !to_bool(x == y + y) ) return __LINE__;

    z = x >> j;
    touch(j);
    if ( !to_bool(y == z + z) ) return __LINE__;

# endif

# if ELEM_SIZE == 2 || defined(__SSE4_1__)
    /*
     * Even when there are no instructions with varying shift counts per
     * field, the code turns out to be a nice exercise for pextr/pinsr.
     */
    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    /*
     * Zap elements for which the shift count is not positive (and for which
     * hence the decrement below would yield a negative count).
     */
    z &= (sh > 0);
    touch(sh);
    x = z << sh;
    touch(sh);
    --sh;
    touch(sh);
    y = z << sh;
    touch(sh);
    if ( !to_bool(x == y + y) ) return __LINE__;

# endif

#endif

#if defined(max) && defined(min)
    /* Minimum and maximum of two values together add up to their sum. */
# ifdef UINT_SIZE
    touch(inv);
    x = min(src, inv);
    touch(inv);
    y = max(src, inv);
    touch(inv);
    if ( !to_bool(x + y == src + inv) ) return __LINE__;
# else
    x = src * alt;
    y = inv * alt;
    touch(y);
    z = max(x, y);
    touch(y);
    y = min(x, y);
    touch(y);
    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
# endif
#endif

#ifdef abs
    /* Absolute value of src with alternating signs applied. */
    x = src * alt;
    touch(x);
    if ( !to_bool(abs(x) == src) ) return __LINE__;
#endif

#ifdef copysignz
    /* Applying the signs (or zeroes) of alt to an all-ones-valued vector. */
    touch(alt);
    if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
#endif

#ifdef swap
    /* Reversing the element order of src is expected to yield inv. */
    touch(src);
    if ( !to_bool(swap(src) == inv) ) return __LINE__;
#endif

#ifdef swap2
    touch(src);
    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
#endif

#if defined(broadcast)
    /* Every element of src + inv is ELEM_COUNT + 1. */
    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
#endif

#if defined(interleave_lo) && defined(interleave_hi)
    /*
     * Corresponding elements of the low and high interleaves differ by
     * ELEM_COUNT / 2, with the sign alternating between elements.
     */
    touch(src);
    x = interleave_lo(inv, src);
    touch(src);
    y = interleave_hi(inv, src);
    touch(src);
# ifdef UINT_SIZE
    z = ((x - y) ^ ~alt) - ~alt;
# else
    z = (x - y) * alt;
# endif
    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
#endif

#if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)

    /*
     * Sign extension, checked against interleaving the values with vectors
     * of their sign masks.
     */
    x = src * alt;
    y = interleave_lo(x, alt < 0);
    touch(x);
    z = widen1(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;

# ifdef widen2
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(z, y);
    touch(x);
    z = widen2(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;

#  ifdef widen3
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(y, y);
    y = interleave_lo(z, y);
    touch(x);
    z = widen3(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;
#  endif
# endif

#endif

#if defined(UINT_SIZE) && defined(interleave_lo)

    /* Zero extension, checked against interleaving with zero vectors. */
    y = interleave_lo(src, (vec_t){});
    z = interleave_lo(y, (vec_t){});

# ifdef widen1
    touch(src);
    x = widen1(src);
    touch(src);
    if ( !to_bool(x == y) ) return __LINE__;
# endif

# ifdef widen2
    touch(src);
    x = widen2(src);
    touch(src);
    if ( !to_bool(x == z) ) return __LINE__;
# endif

# ifdef widen3
    touch(src);
    x = widen3(src);
    touch(src);
    if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
# endif

#endif

#ifdef dup_lo
    /* Duplicating even elements leaves odd elements one below src. */
    touch(src);
    x = dup_lo(src);
    touch(src);
    if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
#endif

#ifdef dup_hi
    /* Duplicating odd elements leaves even elements one above src. */
    touch(src);
    x = dup_hi(src);
    touch(src);
    if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
#endif

    /* Reference for the checks below: even elements of src, odd ones of inv. */
    for ( i = 0; i < ELEM_COUNT; ++i )
        y[i] = (i & 1 ? inv : src)[i];

#ifdef select
# ifdef UINT_SIZE
    select(&z, src, inv, alt);
# else
    select(&z, src, inv, alt > 0);
# endif
    if ( !to_bool(z == y) ) return __LINE__;
#endif

#ifdef select2
# ifdef UINT_SIZE
    select2(&z, src, inv, alt);
# else
    select2(&z, src, inv, alt > 0);
# endif
    if ( !to_bool(z == y) ) return __LINE__;
#endif

#ifdef mix
    touch(src);
    touch(inv);
    x = mix(src, inv);
    if ( !to_bool(x == y) ) return __LINE__;

# ifdef addsub
    /* addsub() is checked against mixing a full difference and a full sum. */
    touch(src);
    touch(inv);
    x = addsub(src, inv);
    touch(src);
    touch(inv);
    y = mix(src - inv, src + inv);
    if ( !to_bool(x == y) ) return __LINE__;
# endif
#endif

#ifdef rotr
    /* Rotating src by one element. */
    x = rotr(src, 1);
    y = (src & (ELEM_COUNT - 1)) + 1;
    if ( !to_bool(x == y) ) return __LINE__;
#endif

#ifdef dot_product
    /* The sum of src[i] * inv[i] has a closed form. */
    touch(src);
    touch(inv);
    x = dot_product(src, inv);
    if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
                                 (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
#endif

#ifdef hadd
    /*
     * Repeated horizontal addition accumulates the sum of all elements of
     * src in the last element.
     */
    x = src;
    for ( i = ELEM_COUNT; i >>= 1; )
    {
        touch(x);
        x = hadd((vec_t){}, x);
    }
    if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;

# ifdef hsub
    /*
     * The horizontal differences of src and inv cancel each other once
     * summed up.
     */
    touch(src);
    touch(inv);
    x = hsub(src, inv);
    for ( i = ELEM_COUNT; i >>= 1; )
        x = hadd(x, (vec_t){});
    if ( !to_bool(x == 0) ) return __LINE__;
# endif
#endif


    return 0;
}