// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                        && !_Abi::template _S_is_partial<_Tp>)
          {
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                       __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(
                                         __binary_op));
      }

    // }}}
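    // Usage sketch (illustrative, not part of this header): via the public
    // <experimental/simd> API the permute chain above backs reduce() for a
    // full NEON register, so a reduction costs O(log2 N) vector operations
    // instead of N-1 scalar ones. Assuming an AArch64 target where
    // native_simd<float> is one 16-byte register with four lanes:
    //
    //   #include <experimental/simd>
    //   namespace stdx = std::experimental;
    //
    //   stdx::native_simd<float> __v([](int __i) { return __i + 1.f; });
    //   float __sum = stdx::reduce(__v);   // {1,2,3,4} -> 10.f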
    // math {{{
    // _S_sqrt {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
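    // Note on the pre-A32 fallback above: a float -> int32 -> float round
    // trip truncates toward zero, but is only valid for |x| < 2^23 (beyond
    // that a float has no fractional bits left and int32 may overflow), so
    // the converted result is blended with the unmodified input. A scalar
    // model of the same idea, with an illustrative name:
    //
    //   inline float
    //   __trunc_via_int(float __x)
    //   {
    //     return __builtin_fabsf(__x) < 0x1p23f
    //              ? static_cast<float>(static_cast<int>(__x))
    //              : __x; // already integral (or out of int32 range)
    //   }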
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
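// Illustrative only: via the public API these hooks implement the
// element-wise math functions, which on an A64 target should each lower to
// a single NEON instruction (vsqrtq_f32, vrndmq_f32, ...) rather than
// per-lane libm calls:
//
//   #include <experimental/simd>
//   namespace stdx = std::experimental;
//
//   stdx::native_simd<float> __x = 2.25f;   // broadcast
//   auto __r = stdx::sqrt(__x);             // every lane 1.5f
//   auto __f = stdx::floor(__x);            // every lane 2.0f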
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) {
                      return static_cast<_I>(
                        __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                       vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                                 __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                       vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint),
                                                  __hi64(__asint)),
                                         __zero),
                                __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
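// How _S_to_bits works, in short: NEON has no SSE-style movemask, so each
// true lane (all bits set) is ANDed with a per-lane bit weight and the lanes
// are then combined with pairwise (vpadd*) or across-vector (vaddv*) adds.
// Because the weights are disjoint bits the additions can never carry, so
// the "sum" is effectively an OR and wraparound in the narrow element type
// is harmless. E.g. for four int32 lanes:
//
//   {-1, 0, -1, 0} & {1, 2, 4, 8}  ->  {1, 0, 4, 0}  --vaddvq_s32-->  0b0101
//
// For 16 int8 lanes the weights repeat per 8-byte half, which is why the
// result is read back as _UShort (low-half bits in byte 0, high-half bits in
// byte 1) after three pairwise-add steps.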
// _MaskImplNeon {{{
template <typename _Abi>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(
                  _Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(
                  _Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
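    // Note: the predicates above sidestep _S_to_bits entirely. A mask lane
    // is either 0 or all-ones, so the whole register can be tested through a
    // wider integer view: _S_all_of (with inactive lanes forced on) checks
    // that both long long halves are -1 via __x[0] + __x[1] == -2, and an
    // 8-byte mask collapses to a single integer compare against -1 or 0.
    // _S_some_of relies on unsigned wraparound: __kk + 1 > 1 is false for
    // exactly the all-zeros (-> 1) and all-ones (-> 0) bit patterns.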
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        // True lanes are all-ones, i.e. -1 in the signed element type, so
        // summing the lanes yields -popcount; hence the negations below.
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()),
                                      int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80