| @@ -1,14 +1,14 @@ | | | @@ -1,14 +1,14 @@ |
1 | /* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ | | 1 | /* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
| @@ -519,26 +519,60 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t | | | @@ -519,26 +519,60 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t |
519 | int32x4_t __tvsh = (__vsh); \ | | 519 | int32x4_t __tvsh = (__vsh); \ |
520 | uint8_t __tbits = (__bits); \ | | 520 | uint8_t __tbits = (__bits); \ |
521 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ | | 521 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ |
522 | 3,2,1,0); \ | | 522 | 3,2,1,0); \ |
523 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ | | 523 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ |
524 | 3,2,1,0); \ | | 524 | 3,2,1,0); \ |
525 | int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ | | 525 | int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ |
526 | 34); \ | | 526 | 34); \ |
527 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | | 527 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ |
528 | }) | | 528 | }) |
529 | #endif /* __LITTLE_ENDIAN__ */ | | 529 | #endif /* __LITTLE_ENDIAN__ */ |
530 | #endif | | 530 | #endif |
531 | | | 531 | |
| | | 532 | #if defined(__GNUC__) && !defined(__clang__) |
| | | 533 | _INTRINSATTR |
| | | 534 | static __inline uint32x4_t |
| | | 535 | vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) |
| | | 536 | { |
| | | 537 | #ifdef __aarch64__ |
| | | 538 | return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); |
| | | 539 | #else |
| | | 540 | return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, |
| | | 541 | (int32x4_t)__vsh, __bits); |
| | | 542 | #endif |
| | | 543 | } |
| | | 544 | #elif defined(__clang__) |
| | | 545 | #ifdef __LITTLE_ENDIAN__ |
| | | 546 | #define vsriq_n_u32(__vins, __vsh, __bits) \ |
| | | 547 | (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \ |
| | | 548 | (int32x4_t)(__vsh), (__bits), 34) |
| | | 549 | #else |
| | | 550 | #define vsliq_n_s32(__vins, __vsh, __bits) ( \ |
| | | 551 | { \ |
| | | 552 | int32x4_t __tvins = (__vins); \ |
| | | 553 | int32x4_t __tvsh = (__vsh); \ |
| | | 554 | uint8_t __tbits = (__bits); \ |
| | | 555 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ |
| | | 556 | 3,2,1,0); \ |
| | | 557 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ |
| | | 558 | 3,2,1,0); \ |
| | | 559 | int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits, \ |
| | | 560 | 34); \ |
| | | 561 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ |
| | | 562 | }) |
| | | 563 | #endif |
| | | 564 | #endif |
| | | 565 | |
532 | _INTRINSATTR | | 566 | _INTRINSATTR |
533 | static __inline void | | 567 | static __inline void |
534 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) | | 568 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) |
535 | { | | 569 | { |
536 | #if defined(__GNUC__) && !defined(__clang__) | | 570 | #if defined(__GNUC__) && !defined(__clang__) |
537 | #ifdef __aarch64__ | | 571 | #ifdef __aarch64__ |
538 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; | | 572 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; |
539 | | | 573 | |
540 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); | | 574 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); |
541 | #else | | 575 | #else |
542 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; | | 576 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; |
543 | | | 577 | |
544 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); | | 578 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); |