| @@ -1,631 +1,665 @@ | | | @@ -1,631 +1,665 @@ |
1 | /* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ | | 1 | /* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
15 | * | | 15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H | | 29 | #ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H |
30 | #define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H | | 30 | #define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H |
31 | | | 31 | |
32 | #if defined(__GNUC__) && !defined(__clang__) | | 32 | #if defined(__GNUC__) && !defined(__clang__) |
33 | | | 33 | |
34 | #define _INTRINSATTR \ | | 34 | #define _INTRINSATTR \ |
35 | __extension__ \ | | 35 | __extension__ \ |
36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | | 36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) |
37 | | | 37 | |
38 | #ifdef __aarch64__ | | 38 | #ifdef __aarch64__ |
39 | typedef __Int32x4_t int32x4_t; | | 39 | typedef __Int32x4_t int32x4_t; |
40 | typedef __Int64x2_t int64x2_t; | | 40 | typedef __Int64x2_t int64x2_t; |
41 | typedef __Int8x16_t int8x16_t; | | 41 | typedef __Int8x16_t int8x16_t; |
42 | typedef __Uint16x8_t uint16x8_t; | | 42 | typedef __Uint16x8_t uint16x8_t; |
43 | typedef __Uint32x4_t uint32x4_t; | | 43 | typedef __Uint32x4_t uint32x4_t; |
44 | typedef __Uint64x2_t uint64x2_t; | | 44 | typedef __Uint64x2_t uint64x2_t; |
45 | typedef __Uint8x16_t uint8x16_t; | | 45 | typedef __Uint8x16_t uint8x16_t; |
46 | #else | | 46 | #else |
47 | typedef __simd128_int32_t int32x4_t; | | 47 | typedef __simd128_int32_t int32x4_t; |
48 | typedef __simd128_int64_t int64x2_t; | | 48 | typedef __simd128_int64_t int64x2_t; |
49 | typedef __simd128_int8_t int8x16_t; | | 49 | typedef __simd128_int8_t int8x16_t; |
50 | typedef __simd128_uint16_t uint16x8_t; | | 50 | typedef __simd128_uint16_t uint16x8_t; |
51 | typedef __simd128_uint32_t uint32x4_t; | | 51 | typedef __simd128_uint32_t uint32x4_t; |
52 | typedef __simd128_uint64_t uint64x2_t; | | 52 | typedef __simd128_uint64_t uint64x2_t; |
53 | typedef __simd128_uint8_t uint8x16_t; | | 53 | typedef __simd128_uint8_t uint8x16_t; |
54 | | | 54 | |
55 | typedef __simd64_int8_t int8x8_t; | | 55 | typedef __simd64_int8_t int8x8_t; |
56 | typedef __simd64_uint8_t uint8x8_t; | | 56 | typedef __simd64_uint8_t uint8x8_t; |
57 | typedef __builtin_neon_udi uint64x1_t; | | 57 | typedef __builtin_neon_udi uint64x1_t; |
58 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | | 58 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; |
59 | #endif | | 59 | #endif |
60 | | | 60 | |
61 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | | 61 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) |
62 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) | | 62 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) |
63 | #else | | 63 | #else |
64 | #define __neon_lane_index(__v, __i) __i | | 64 | #define __neon_lane_index(__v, __i) __i |
65 | #endif | | 65 | #endif |
66 | | | 66 | |
67 | #elif defined(__clang__) | | 67 | #elif defined(__clang__) |
68 | | | 68 | |
69 | #define _INTRINSATTR \ | | 69 | #define _INTRINSATTR \ |
70 | __attribute__((__always_inline__, __nodebug__)) | | 70 | __attribute__((__always_inline__, __nodebug__)) |
71 | | | 71 | |
72 | typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; | | 72 | typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; |
73 | typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; | | 73 | typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; |
74 | typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; | | 74 | typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; |
75 | | | 75 | |
76 | typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; | | 76 | typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; |
77 | typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; | | 77 | typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; |
78 | typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; | | 78 | typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; |
79 | typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; | | 79 | typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; |
80 | | | 80 | |
81 | typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; | | 81 | typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; |
82 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | | 82 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; |
83 | | | 83 | |
84 | #ifdef __LITTLE_ENDIAN__ | | 84 | #ifdef __LITTLE_ENDIAN__ |
85 | #define __neon_lane_index(__v, __i) __i | | 85 | #define __neon_lane_index(__v, __i) __i |
86 | #else | | 86 | #else |
87 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) | | 87 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i)) |
88 | #endif | | 88 | #endif |
89 | | | 89 | |
90 | #else | | 90 | #else |
91 | | | 91 | |
92 | #error Teach me how to neon in your compiler! | | 92 | #error Teach me how to neon in your compiler! |
93 | | | 93 | |
94 | #endif | | 94 | #endif |
95 | | | 95 | |
96 | _INTRINSATTR | | 96 | _INTRINSATTR |
97 | static __inline uint32x4_t | | 97 | static __inline uint32x4_t |
98 | vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) | | 98 | vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) |
99 | { | | 99 | { |
100 | return __v0 + __v1; | | 100 | return __v0 + __v1; |
101 | } | | 101 | } |
102 | | | 102 | |
103 | _INTRINSATTR | | 103 | _INTRINSATTR |
104 | static __inline uint32x4_t | | 104 | static __inline uint32x4_t |
105 | vcltq_s32(int32x4_t __v0, int32x4_t __v1) | | 105 | vcltq_s32(int32x4_t __v0, int32x4_t __v1) |
106 | { | | 106 | { |
107 | return (uint32x4_t)(__v0 < __v1); | | 107 | return (uint32x4_t)(__v0 < __v1); |
108 | } | | 108 | } |
109 | | | 109 | |
110 | _INTRINSATTR | | 110 | _INTRINSATTR |
111 | static __inline int32x4_t | | 111 | static __inline int32x4_t |
112 | vdupq_n_s32(int32_t __x) | | 112 | vdupq_n_s32(int32_t __x) |
113 | { | | 113 | { |
114 | return (int32x4_t) { __x, __x, __x, __x }; | | 114 | return (int32x4_t) { __x, __x, __x, __x }; |
115 | } | | 115 | } |
116 | | | 116 | |
117 | _INTRINSATTR | | 117 | _INTRINSATTR |
118 | static __inline uint32x4_t | | 118 | static __inline uint32x4_t |
119 | vdupq_n_u32(uint32_t __x) | | 119 | vdupq_n_u32(uint32_t __x) |
120 | { | | 120 | { |
121 | return (uint32x4_t) { __x, __x, __x, __x }; | | 121 | return (uint32x4_t) { __x, __x, __x, __x }; |
122 | } | | 122 | } |
123 | | | 123 | |
124 | _INTRINSATTR | | 124 | _INTRINSATTR |
125 | static __inline uint8x16_t | | 125 | static __inline uint8x16_t |
126 | vdupq_n_u8(uint8_t __x) | | 126 | vdupq_n_u8(uint8_t __x) |
127 | { | | 127 | { |
128 | return (uint8x16_t) { | | 128 | return (uint8x16_t) { |
129 | __x, __x, __x, __x, __x, __x, __x, __x, | | 129 | __x, __x, __x, __x, __x, __x, __x, __x, |
130 | __x, __x, __x, __x, __x, __x, __x, __x, | | 130 | __x, __x, __x, __x, __x, __x, __x, __x, |
131 | }; | | 131 | }; |
132 | } | | 132 | } |
133 | | | 133 | |
134 | #if defined(__GNUC__) && !defined(__clang__) | | 134 | #if defined(__GNUC__) && !defined(__clang__) |
135 | _INTRINSATTR | | 135 | _INTRINSATTR |
136 | static __inline uint32x4_t | | 136 | static __inline uint32x4_t |
137 | vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) | | 137 | vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) |
138 | { | | 138 | { |
139 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | | 139 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) |
140 | return __builtin_shuffle(__hi, __lo, | | 140 | return __builtin_shuffle(__hi, __lo, |
141 | (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); | | 141 | (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); |
142 | #else | | 142 | #else |
143 | return __builtin_shuffle(__lo, __hi, | | 143 | return __builtin_shuffle(__lo, __hi, |
144 | (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); | | 144 | (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); |
145 | #endif | | 145 | #endif |
146 | } | | 146 | } |
147 | #elif defined(__clang__) | | 147 | #elif defined(__clang__) |
148 | #ifdef __LITTLE_ENDIAN__ | | 148 | #ifdef __LITTLE_ENDIAN__ |
149 | #define vextq_u32(__lo, __hi, __i) \ | | 149 | #define vextq_u32(__lo, __hi, __i) \ |
150 | (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | | 150 | (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ |
151 | (int8x16_t)(__hi), (__i), 50) | | 151 | (int8x16_t)(__hi), (__i), 50) |
152 | #else | | 152 | #else |
153 | #define vextq_u32(__lo, __hi, __i) ( \ | | 153 | #define vextq_u32(__lo, __hi, __i) ( \ |
154 | { \ | | 154 | { \ |
155 | uint32x4_t __tlo = (__lo); \ | | 155 | uint32x4_t __tlo = (__lo); \ |
156 | uint32x4_t __thi = (__hi); \ | | 156 | uint32x4_t __thi = (__hi); \ |
157 | uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ | | 157 | uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ |
158 | uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ | | 158 | uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ |
159 | uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | | 159 | uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ |
160 | (int8x16_t)__hi_r, __i, 50); \ | | 160 | (int8x16_t)__hi_r, __i, 50); \ |
161 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | | 161 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ |
162 | }) | | 162 | }) |
163 | #endif /* __LITTLE_ENDIAN__ */ | | 163 | #endif /* __LITTLE_ENDIAN__ */ |
164 | #endif | | 164 | #endif |
165 | | | 165 | |
166 | #if defined(__GNUC__) && !defined(__clang__) | | 166 | #if defined(__GNUC__) && !defined(__clang__) |
167 | _INTRINSATTR | | 167 | _INTRINSATTR |
168 | static __inline uint8x16_t | | 168 | static __inline uint8x16_t |
169 | vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) | | 169 | vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) |
170 | { | | 170 | { |
171 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | | 171 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) |
172 | return __builtin_shuffle(__hi, __lo, | | 172 | return __builtin_shuffle(__hi, __lo, |
173 | (uint8x16_t) { | | 173 | (uint8x16_t) { |
174 | 16 - __i, 17 - __i, 18 - __i, 19 - __i, | | 174 | 16 - __i, 17 - __i, 18 - __i, 19 - __i, |
175 | 20 - __i, 21 - __i, 22 - __i, 23 - __i, | | 175 | 20 - __i, 21 - __i, 22 - __i, 23 - __i, |
176 | 24 - __i, 25 - __i, 26 - __i, 27 - __i, | | 176 | 24 - __i, 25 - __i, 26 - __i, 27 - __i, |
177 | 28 - __i, 29 - __i, 30 - __i, 31 - __i, | | 177 | 28 - __i, 29 - __i, 30 - __i, 31 - __i, |
178 | }); | | 178 | }); |
179 | #else | | 179 | #else |
180 | return __builtin_shuffle(__lo, __hi, | | 180 | return __builtin_shuffle(__lo, __hi, |
181 | (uint8x16_t) { | | 181 | (uint8x16_t) { |
182 | __i + 0, __i + 1, __i + 2, __i + 3, | | 182 | __i + 0, __i + 1, __i + 2, __i + 3, |
183 | __i + 4, __i + 5, __i + 6, __i + 7, | | 183 | __i + 4, __i + 5, __i + 6, __i + 7, |
184 | __i + 8, __i + 9, __i + 10, __i + 11, | | 184 | __i + 8, __i + 9, __i + 10, __i + 11, |
185 | __i + 12, __i + 13, __i + 14, __i + 15, | | 185 | __i + 12, __i + 13, __i + 14, __i + 15, |
186 | }); | | 186 | }); |
187 | #endif | | 187 | #endif |
188 | } | | 188 | } |
189 | #elif defined(__clang__) | | 189 | #elif defined(__clang__) |
190 | #ifdef __LITTLE_ENDIAN__ | | 190 | #ifdef __LITTLE_ENDIAN__ |
191 | #define vextq_u8(__lo, __hi, __i) \ | | 191 | #define vextq_u8(__lo, __hi, __i) \ |
192 | (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | | 192 | (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ |
193 | (int8x16_t)(__hi), (__i), 48) | | 193 | (int8x16_t)(__hi), (__i), 48) |
194 | #else | | 194 | #else |
195 | #define vextq_u8(__lo, __hi, __i) ( \ | | 195 | #define vextq_u8(__lo, __hi, __i) ( \ |
196 | { \ | | 196 | { \ |
197 | uint8x16_t __tlo = (__lo); \ | | 197 | uint8x16_t __tlo = (__lo); \ |
198 | uint8x16_t __thi = (__hi); \ | | 198 | uint8x16_t __thi = (__hi); \ |
199 | uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ | | 199 | uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ |
200 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | | 200 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ |
201 | uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ | | 201 | uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ |
202 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | | 202 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ |
203 | uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | | 203 | uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ |
204 | (int8x16_t)__hi_r, (__i), 48); \ | | 204 | (int8x16_t)__hi_r, (__i), 48); \ |
205 | __builtin_shufflevector(__r, __r, \ | | 205 | __builtin_shufflevector(__r, __r, \ |
206 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | | 206 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ |
207 | }) | | 207 | }) |
208 | #endif /* __LITTLE_ENDIAN__ */ | | 208 | #endif /* __LITTLE_ENDIAN__ */ |
209 | #endif | | 209 | #endif |
210 | | | 210 | |
211 | #if defined(__GNUC__) && !defined(__clang__) | | 211 | #if defined(__GNUC__) && !defined(__clang__) |
212 | _INTRINSATTR | | 212 | _INTRINSATTR |
213 | static __inline uint32_t | | 213 | static __inline uint32_t |
214 | vgetq_lane_u32(uint32x4_t __v, uint8_t __i) | | 214 | vgetq_lane_u32(uint32x4_t __v, uint8_t __i) |
215 | { | | 215 | { |
216 | #ifdef __aarch64__ | | 216 | #ifdef __aarch64__ |
217 | return __v[__i]; | | 217 | return __v[__i]; |
218 | #else | | 218 | #else |
219 | return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); | | 219 | return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); |
220 | #endif | | 220 | #endif |
221 | } | | 221 | } |
222 | #elif defined(__clang__) | | 222 | #elif defined(__clang__) |
223 | #define vgetq_lane_u32(__v, __i) \ | | 223 | #define vgetq_lane_u32(__v, __i) \ |
224 | (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ | | 224 | (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ |
225 | __neon_lane_index(__v, __i)) | | 225 | __neon_lane_index(__v, __i)) |
226 | #endif | | 226 | #endif |
227 | | | 227 | |
228 | _INTRINSATTR | | 228 | _INTRINSATTR |
229 | static __inline uint32x4_t | | 229 | static __inline uint32x4_t |
230 | vld1q_u32(const uint32_t *__p32) | | 230 | vld1q_u32(const uint32_t *__p32) |
231 | { | | 231 | { |
232 | #if defined(__GNUC__) && !defined(__clang__) | | 232 | #if defined(__GNUC__) && !defined(__clang__) |
233 | #ifdef __aarch64__ | | 233 | #ifdef __aarch64__ |
234 | const __builtin_aarch64_simd_si *__p = | | 234 | const __builtin_aarch64_simd_si *__p = |
235 | (const __builtin_aarch64_simd_si *)__p32; | | 235 | (const __builtin_aarch64_simd_si *)__p32; |
236 | | | 236 | |
237 | return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); | | 237 | return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); |
238 | #else | | 238 | #else |
239 | const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; | | 239 | const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; |
240 | | | 240 | |
241 | return (uint32x4_t)__builtin_neon_vld1v4si(__p); | | 241 | return (uint32x4_t)__builtin_neon_vld1v4si(__p); |
242 | #endif | | 242 | #endif |
243 | #elif defined(__clang__) | | 243 | #elif defined(__clang__) |
244 | uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); | | 244 | uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); |
245 | #ifndef __LITTLE_ENDIAN__ | | 245 | #ifndef __LITTLE_ENDIAN__ |
246 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | | 246 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); |
247 | #endif | | 247 | #endif |
248 | return __v; | | 248 | return __v; |
249 | #endif | | 249 | #endif |
250 | } | | 250 | } |
251 | | | 251 | |
252 | _INTRINSATTR | | 252 | _INTRINSATTR |
253 | static __inline uint8x16_t | | 253 | static __inline uint8x16_t |
254 | vld1q_u8(const uint8_t *__p8) | | 254 | vld1q_u8(const uint8_t *__p8) |
255 | { | | 255 | { |
256 | #if defined(__GNUC__) && !defined(__clang__) | | 256 | #if defined(__GNUC__) && !defined(__clang__) |
257 | #ifdef __aarch64__ | | 257 | #ifdef __aarch64__ |
258 | const __builtin_aarch64_simd_qi *__p = | | 258 | const __builtin_aarch64_simd_qi *__p = |
259 | (const __builtin_aarch64_simd_qi *)__p8; | | 259 | (const __builtin_aarch64_simd_qi *)__p8; |
260 | | | 260 | |
261 | return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); | | 261 | return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); |
262 | #else | | 262 | #else |
263 | const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; | | 263 | const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; |
264 | | | 264 | |
265 | return (uint8x16_t)__builtin_neon_vld1v16qi(__p); | | 265 | return (uint8x16_t)__builtin_neon_vld1v16qi(__p); |
266 | #endif | | 266 | #endif |
267 | #elif defined(__clang__) | | 267 | #elif defined(__clang__) |
268 | uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); | | 268 | uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); |
269 | #ifndef __LITTLE_ENDIAN__ | | 269 | #ifndef __LITTLE_ENDIAN__ |
270 | __v = __builtin_shufflevector(__v, __v, | | 270 | __v = __builtin_shufflevector(__v, __v, |
271 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | | 271 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); |
272 | #endif | | 272 | #endif |
273 | return __v; | | 273 | return __v; |
274 | #endif | | 274 | #endif |
275 | } | | 275 | } |
276 | | | 276 | |
277 | _INTRINSATTR | | 277 | _INTRINSATTR |
278 | static __inline uint8x16_t | | 278 | static __inline uint8x16_t |
279 | vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) | | 279 | vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) |
280 | { | | 280 | { |
281 | #if defined(__GNUC__) && !defined(__clang__) | | 281 | #if defined(__GNUC__) && !defined(__clang__) |
282 | #ifdef __aarch64__ | | 282 | #ifdef __aarch64__ |
283 | uint8x16_t __res; | | 283 | uint8x16_t __res; |
284 | __asm__("tbl %0.16b, {%1.16b}, %2.16b" | | 284 | __asm__("tbl %0.16b, {%1.16b}, %2.16b" |
285 | : "=w"(__res) : "w"(__tab), "w"(__idx)); | | 285 | : "=w"(__res) : "w"(__tab), "w"(__idx)); |
286 | return __res; | | 286 | return __res; |
287 | #else | | 287 | #else |
288 | /* | | 288 | /* |
289 | * No native ARMv7 NEON instruction for this, so do it via two | | 289 | * No native ARMv7 NEON instruction for this, so do it via two |
290 | * half-width TBLs instead (vtbl2_u8 equivalent). | | 290 | * half-width TBLs instead (vtbl2_u8 equivalent). |
291 | */ | | 291 | */ |
292 | uint64x2_t __tab64 = (uint64x2_t)__tab; | | 292 | uint64x2_t __tab64 = (uint64x2_t)__tab; |
293 | uint8x8_t __tablo = (uint8x8_t)__tab64[0]; | | 293 | uint8x8_t __tablo = (uint8x8_t)__tab64[0]; |
294 | uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; | | 294 | uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; |
295 | uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; | | 295 | uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; |
296 | union { | | 296 | union { |
297 | uint8x8x2_t __u8x8x2; | | 297 | uint8x8x2_t __u8x8x2; |
298 | __builtin_neon_ti __ti; | | 298 | __builtin_neon_ti __ti; |
299 | } __u = { __tab8x8x2 }; | | 299 | } __u = { __tab8x8x2 }; |
300 | uint64x2_t __idx64, __out64; | | 300 | uint64x2_t __idx64, __out64; |
301 | int8x8_t __idxlo, __idxhi, __outlo, __outhi; | | 301 | int8x8_t __idxlo, __idxhi, __outlo, __outhi; |
302 | | | 302 | |
303 | __idx64 = (uint64x2_t)__idx; | | 303 | __idx64 = (uint64x2_t)__idx; |
304 | __idxlo = (int8x8_t)__idx64[0]; | | 304 | __idxlo = (int8x8_t)__idx64[0]; |
305 | __idxhi = (int8x8_t)__idx64[1]; | | 305 | __idxhi = (int8x8_t)__idx64[1]; |
306 | __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); | | 306 | __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); |
307 | __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); | | 307 | __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); |
308 | __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; | | 308 | __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; |
309 | | | 309 | |
310 | return (uint8x16_t)__out64; | | 310 | return (uint8x16_t)__out64; |
311 | #endif | | 311 | #endif |
312 | #elif defined(__clang__) | | 312 | #elif defined(__clang__) |
313 | #ifdef __LITTLE_ENDIAN__ | | 313 | #ifdef __LITTLE_ENDIAN__ |
314 | return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, | | 314 | return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, |
315 | (int8x16_t)__idx, 48); | | 315 | (int8x16_t)__idx, 48); |
316 | #else | | 316 | #else |
317 | uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab, | | 317 | uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab, |
318 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | | 318 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); |
319 | uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx, | | 319 | uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx, |
320 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | | 320 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); |
321 | uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v( | | 321 | uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v( |
322 | (int8x16_t)__tab_r, (int8x16_t)__idx_r, 48); | | 322 | (int8x16_t)__tab_r, (int8x16_t)__idx_r, 48); |
323 | return __builtin_shufflevector(__r, __r, | | 323 | return __builtin_shufflevector(__r, __r, |
324 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | | 324 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); |
325 | #endif | | 325 | #endif |
326 | #endif | | 326 | #endif |
327 | } | | 327 | } |
328 | | | 328 | |
329 | _INTRINSATTR | | 329 | _INTRINSATTR |
330 | static __inline int32x4_t | | 330 | static __inline int32x4_t |
331 | vreinterpretq_s32_u8(uint8x16_t __v) | | 331 | vreinterpretq_s32_u8(uint8x16_t __v) |
332 | { | | 332 | { |
333 | return (int32x4_t)__v; | | 333 | return (int32x4_t)__v; |
334 | } | | 334 | } |
335 | | | 335 | |
336 | _INTRINSATTR | | 336 | _INTRINSATTR |
337 | static __inline uint16x8_t | | 337 | static __inline uint16x8_t |
338 | vreinterpretq_u16_u32(uint32x4_t __v) | | 338 | vreinterpretq_u16_u32(uint32x4_t __v) |
339 | { | | 339 | { |
340 | return (uint16x8_t)__v; | | 340 | return (uint16x8_t)__v; |
341 | } | | 341 | } |
342 | | | 342 | |
343 | _INTRINSATTR | | 343 | _INTRINSATTR |
344 | static __inline uint32x4_t | | 344 | static __inline uint32x4_t |
345 | vreinterpretq_u32_u16(uint16x8_t __v) | | 345 | vreinterpretq_u32_u16(uint16x8_t __v) |
346 | { | | 346 | { |
347 | return (uint32x4_t)__v; | | 347 | return (uint32x4_t)__v; |
348 | } | | 348 | } |
349 | | | 349 | |
350 | _INTRINSATTR | | 350 | _INTRINSATTR |
351 | static __inline uint32x4_t | | 351 | static __inline uint32x4_t |
352 | vreinterpretq_u32_u64(uint64x2_t __v) | | 352 | vreinterpretq_u32_u64(uint64x2_t __v) |
353 | { | | 353 | { |
354 | return (uint32x4_t)__v; | | 354 | return (uint32x4_t)__v; |
355 | } | | 355 | } |
356 | | | 356 | |
357 | _INTRINSATTR | | 357 | _INTRINSATTR |
358 | static __inline uint32x4_t | | 358 | static __inline uint32x4_t |
359 | vreinterpretq_u32_u8(uint8x16_t __v) | | 359 | vreinterpretq_u32_u8(uint8x16_t __v) |
360 | { | | 360 | { |
361 | return (uint32x4_t)__v; | | 361 | return (uint32x4_t)__v; |
362 | } | | 362 | } |
363 | | | 363 | |
364 | _INTRINSATTR | | 364 | _INTRINSATTR |
365 | static __inline uint64x2_t | | 365 | static __inline uint64x2_t |
366 | vreinterpretq_u64_u32(uint32x4_t __v) | | 366 | vreinterpretq_u64_u32(uint32x4_t __v) |
367 | { | | 367 | { |
368 | return (uint64x2_t)__v; | | 368 | return (uint64x2_t)__v; |
369 | } | | 369 | } |
370 | | | 370 | |
371 | _INTRINSATTR | | 371 | _INTRINSATTR |
372 | static __inline uint64x2_t | | 372 | static __inline uint64x2_t |
373 | vreinterpretq_u64_u8(uint8x16_t __v) | | 373 | vreinterpretq_u64_u8(uint8x16_t __v) |
374 | { | | 374 | { |
375 | return (uint64x2_t)__v; | | 375 | return (uint64x2_t)__v; |
376 | } | | 376 | } |
377 | | | 377 | |
378 | _INTRINSATTR | | 378 | _INTRINSATTR |
379 | static __inline uint8x16_t | | 379 | static __inline uint8x16_t |
380 | vreinterpretq_u8_s32(int32x4_t __v) | | 380 | vreinterpretq_u8_s32(int32x4_t __v) |
381 | { | | 381 | { |
382 | return (uint8x16_t)__v; | | 382 | return (uint8x16_t)__v; |
383 | } | | 383 | } |
384 | | | 384 | |
385 | _INTRINSATTR | | 385 | _INTRINSATTR |
386 | static __inline uint8x16_t | | 386 | static __inline uint8x16_t |
387 | vreinterpretq_u8_u32(uint32x4_t __v) | | 387 | vreinterpretq_u8_u32(uint32x4_t __v) |
388 | { | | 388 | { |
389 | return (uint8x16_t)__v; | | 389 | return (uint8x16_t)__v; |
390 | } | | 390 | } |
391 | | | 391 | |
392 | _INTRINSATTR | | 392 | _INTRINSATTR |
393 | static __inline uint8x16_t | | 393 | static __inline uint8x16_t |
394 | vreinterpretq_u8_u64(uint64x2_t __v) | | 394 | vreinterpretq_u8_u64(uint64x2_t __v) |
395 | { | | 395 | { |
396 | return (uint8x16_t)__v; | | 396 | return (uint8x16_t)__v; |
397 | } | | 397 | } |
398 | | | 398 | |
399 | _INTRINSATTR | | 399 | _INTRINSATTR |
400 | static __inline uint16x8_t | | 400 | static __inline uint16x8_t |
401 | vrev32q_u16(uint16x8_t __v) | | 401 | vrev32q_u16(uint16x8_t __v) |
402 | { | | 402 | { |
403 | #if defined(__GNUC__) && !defined(__clang__) | | 403 | #if defined(__GNUC__) && !defined(__clang__) |
404 | return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); | | 404 | return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); |
405 | #elif defined(__clang__) | | 405 | #elif defined(__clang__) |
406 | return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6); | | 406 | return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6); |
407 | #endif | | 407 | #endif |
408 | } | | 408 | } |
409 | | | 409 | |
410 | _INTRINSATTR | | 410 | _INTRINSATTR |
411 | static __inline uint8x16_t | | 411 | static __inline uint8x16_t |
412 | vrev32q_u8(uint8x16_t __v) | | 412 | vrev32q_u8(uint8x16_t __v) |
413 | { | | 413 | { |
414 | #if defined(__GNUC__) && !defined(__clang__) | | 414 | #if defined(__GNUC__) && !defined(__clang__) |
415 | return __builtin_shuffle(__v, | | 415 | return __builtin_shuffle(__v, |
416 | (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); | | 416 | (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); |
417 | #elif defined(__clang__) | | 417 | #elif defined(__clang__) |
418 | return __builtin_shufflevector(__v, __v, | | 418 | return __builtin_shufflevector(__v, __v, |
419 | 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); | | 419 | 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); |
420 | #endif | | 420 | #endif |
421 | } | | 421 | } |
422 | | | 422 | |
423 | #if defined(__GNUC__) && !defined(__clang__) | | 423 | #if defined(__GNUC__) && !defined(__clang__) |
424 | _INTRINSATTR | | 424 | _INTRINSATTR |
425 | static __inline uint32x4_t | | 425 | static __inline uint32x4_t |
426 | vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) | | 426 | vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) |
427 | { | | 427 | { |
428 | __v[__neon_lane_index(__v, __i)] = __x; | | 428 | __v[__neon_lane_index(__v, __i)] = __x; |
429 | return __v; | | 429 | return __v; |
430 | } | | 430 | } |
431 | #elif defined(__clang__) | | 431 | #elif defined(__clang__) |
432 | #define vsetq_lane_u32(__x, __v, __i) \ | | 432 | #define vsetq_lane_u32(__x, __v, __i) \ |
433 | (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ | | 433 | (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ |
434 | __neon_lane_index(__v, __i)) | | 434 | __neon_lane_index(__v, __i)) |
435 | #endif | | 435 | #endif |
436 | | | 436 | |
437 | #if defined(__GNUC__) && !defined(__clang__) | | 437 | #if defined(__GNUC__) && !defined(__clang__) |
438 | _INTRINSATTR | | 438 | _INTRINSATTR |
439 | static __inline uint64x2_t | | 439 | static __inline uint64x2_t |
440 | vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) | | 440 | vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) |
441 | { | | 441 | { |
442 | __v[__neon_lane_index(__v, __i)] = __x; | | 442 | __v[__neon_lane_index(__v, __i)] = __x; |
443 | return __v; | | 443 | return __v; |
444 | } | | 444 | } |
445 | #elif defined(__clang__) | | 445 | #elif defined(__clang__) |
446 | #define vsetq_lane_u64(__x, __v, __i) \ | | 446 | #define vsetq_lane_u64(__x, __v, __i) \ |
447 | (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ | | 447 | (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ |
448 | __neon_lane_index(__v, __i)) | | 448 | __neon_lane_index(__v, __i)) |
449 | #endif | | 449 | #endif |
450 | | | 450 | |
451 | #if defined(__GNUC__) && !defined(__clang__) | | 451 | #if defined(__GNUC__) && !defined(__clang__) |
452 | _INTRINSATTR | | 452 | _INTRINSATTR |
453 | static __inline uint32x4_t | | 453 | static __inline uint32x4_t |
454 | vshlq_n_u32(uint32x4_t __v, uint8_t __bits) | | 454 | vshlq_n_u32(uint32x4_t __v, uint8_t __bits) |
455 | { | | 455 | { |
456 | #ifdef __aarch64__ | | 456 | #ifdef __aarch64__ |
457 | return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); | | 457 | return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); |
458 | #else | | 458 | #else |
459 | return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); | | 459 | return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); |
460 | #endif | | 460 | #endif |
461 | } | | 461 | } |
462 | #elif defined(__clang__) | | 462 | #elif defined(__clang__) |
463 | #define vshlq_n_u32(__v, __bits) \ | | 463 | #define vshlq_n_u32(__v, __bits) \ |
464 | (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) | | 464 | (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) |
465 | #endif | | 465 | #endif |
466 | | | 466 | |
467 | #if defined(__GNUC__) && !defined(__clang__) | | 467 | #if defined(__GNUC__) && !defined(__clang__) |
468 | _INTRINSATTR | | 468 | _INTRINSATTR |
469 | static __inline uint32x4_t | | 469 | static __inline uint32x4_t |
470 | vshrq_n_u32(uint32x4_t __v, uint8_t __bits) | | 470 | vshrq_n_u32(uint32x4_t __v, uint8_t __bits) |
471 | { | | 471 | { |
472 | #ifdef __aarch64__ | | 472 | #ifdef __aarch64__ |
473 | return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); | | 473 | return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); |
474 | #else | | 474 | #else |
475 | return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); | | 475 | return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); |
476 | #endif | | 476 | #endif |
477 | } | | 477 | } |
478 | #elif defined(__clang__) | | 478 | #elif defined(__clang__) |
479 | #define vshrq_n_u32(__v, __bits) \ | | 479 | #define vshrq_n_u32(__v, __bits) \ |
480 | (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) | | 480 | (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) |
481 | #endif | | 481 | #endif |
482 | | | 482 | |
483 | #if defined(__GNUC__) && !defined(__clang__) | | 483 | #if defined(__GNUC__) && !defined(__clang__) |
484 | _INTRINSATTR | | 484 | _INTRINSATTR |
485 | static __inline uint8x16_t | | 485 | static __inline uint8x16_t |
486 | vshrq_n_u8(uint8x16_t __v, uint8_t __bits) | | 486 | vshrq_n_u8(uint8x16_t __v, uint8_t __bits) |
487 | { | | 487 | { |
488 | #ifdef __aarch64__ | | 488 | #ifdef __aarch64__ |
489 | return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); | | 489 | return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); |
490 | #else | | 490 | #else |
491 | return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); | | 491 | return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); |
492 | #endif | | 492 | #endif |
493 | } | | 493 | } |
494 | #elif defined(__clang__) | | 494 | #elif defined(__clang__) |
495 | #define vshrq_n_u8(__v, __bits) \ | | 495 | #define vshrq_n_u8(__v, __bits) \ |
496 | (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) | | 496 | (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) |
497 | #endif | | 497 | #endif |
498 | | | 498 | |
499 | #if defined(__GNUC__) && !defined(__clang__) | | 499 | #if defined(__GNUC__) && !defined(__clang__) |
500 | _INTRINSATTR | | 500 | _INTRINSATTR |
501 | static __inline int32x4_t | | 501 | static __inline int32x4_t |
502 | vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) | | 502 | vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) |
503 | { | | 503 | { |
504 | #ifdef __aarch64__ | | 504 | #ifdef __aarch64__ |
505 | return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); | | 505 | return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); |
506 | #else | | 506 | #else |
507 | return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); | | 507 | return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); |
508 | #endif | | 508 | #endif |
509 | } | | 509 | } |
510 | #elif defined(__clang__) | | 510 | #elif defined(__clang__) |
511 | #ifdef __LITTLE_ENDIAN__ | | 511 | #ifdef __LITTLE_ENDIAN__ |
512 | #define vsliq_n_s32(__vins, __vsh, __bits) \ | | 512 | #define vsliq_n_s32(__vins, __vsh, __bits) \ |
513 | (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ | | 513 | (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ |
514 | (int32x4_t)(__vsh), (__bits), 34) | | 514 | (int32x4_t)(__vsh), (__bits), 34) |
515 | #else | | 515 | #else |
516 | #define vsliq_n_s32(__vins, __vsh, __bits) ( \ | | 516 | #define vsliq_n_s32(__vins, __vsh, __bits) ( \ |
517 | { \ | | 517 | { \ |
518 | int32x4_t __tvins = (__vins); \ | | 518 | int32x4_t __tvins = (__vins); \ |
519 | int32x4_t __tvsh = (__vsh); \ | | 519 | int32x4_t __tvsh = (__vsh); \ |
520 | uint8_t __tbits = (__bits); \ | | 520 | uint8_t __tbits = (__bits); \ |
521 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ | | 521 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ |
522 | 3,2,1,0); \ | | 522 | 3,2,1,0); \ |
523 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ | | 523 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ |
524 | 3,2,1,0); \ | | 524 | 3,2,1,0); \ |
525 | int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \ | | 525 | int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \ |
526 | 34); \ | | 526 | 34); \ |
527 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | | 527 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ |
528 | }) | | 528 | }) |
529 | #endif /* __LITTLE_ENDIAN__ */ | | 529 | #endif /* __LITTLE_ENDIAN__ */ |
530 | #endif | | 530 | #endif |
531 | | | 531 | |
| | | 532 | #if defined(__GNUC__) && !defined(__clang__) |
| | | 533 | _INTRINSATTR |
| | | 534 | static __inline uint32x4_t |
| | | 535 | vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) |
| | | 536 | { |
| | | 537 | #ifdef __aarch64__ |
| | | 538 | return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); |
| | | 539 | #else |
| | | 540 | return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, |
| | | 541 | (int32x4_t)__vsh, __bits); |
| | | 542 | #endif |
| | | 543 | } |
| | | 544 | #elif defined(__clang__) |
| | | 545 | #ifdef __LITTLE_ENDIAN__ |
| | | 546 | #define vsriq_n_u32(__vins, __vsh, __bits) \ |
| | | 547 | (uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \ |
| | | 548 | (int32x4_t)(__vsh), (__bits), 34) |
| | | 549 | #else |
| | | 550 | #define vsriq_n_u32(__vins, __vsh, __bits) ( \ |
| | | 551 | { \ |
| | | 552 | uint32x4_t __tvins = (__vins); \ |
| | | 553 | uint32x4_t __tvsh = (__vsh); \ |
| | | 554 | uint8_t __tbits = (__bits); \ |
| | | 555 | uint32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ |
| | | 556 | 3,2,1,0); \ |
| | | 557 | uint32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ |
| | | 558 | 3,2,1,0); \ |
| | | 559 | uint32x4_t __r = (uint32x4_t)__builtin_neon_vsriq_n_v( \ |
| | | 560 | (int32x4_t)__vins_r, (int32x4_t)__vsh_r, __tbits, 34); \ |
| | | 561 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ |
| | | 562 | }) |
| | | 563 | #endif |
| | | 564 | #endif |
| | | 565 | |
532 | _INTRINSATTR | | 566 | _INTRINSATTR |
533 | static __inline void | | 567 | static __inline void |
534 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) | | 568 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) |
535 | { | | 569 | { |
536 | #if defined(__GNUC__) && !defined(__clang__) | | 570 | #if defined(__GNUC__) && !defined(__clang__) |
537 | #ifdef __aarch64__ | | 571 | #ifdef __aarch64__ |
538 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; | | 572 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; |
539 | | | 573 | |
540 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); | | 574 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); |
541 | #else | | 575 | #else |
542 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; | | 576 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; |
543 | | | 577 | |
544 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); | | 578 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); |
545 | #endif | | 579 | #endif |
546 | #elif defined(__clang__) | | 580 | #elif defined(__clang__) |
547 | #ifndef __LITTLE_ENDIAN__ | | 581 | #ifndef __LITTLE_ENDIAN__ |
548 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | | 582 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); |
549 | #endif | | 583 | #endif |
550 | __builtin_neon_vst1q_v(__p32, __v, 50); | | 584 | __builtin_neon_vst1q_v(__p32, __v, 50); |
551 | #endif | | 585 | #endif |
552 | } | | 586 | } |
553 | | | 587 | |
554 | _INTRINSATTR | | 588 | _INTRINSATTR |
555 | static __inline void | | 589 | static __inline void |
556 | vst1q_u8(uint8_t *__p8, uint8x16_t __v) | | 590 | vst1q_u8(uint8_t *__p8, uint8x16_t __v) |
557 | { | | 591 | { |
558 | #if defined(__GNUC__) && !defined(__clang__) | | 592 | #if defined(__GNUC__) && !defined(__clang__) |
559 | #ifdef __aarch64__ | | 593 | #ifdef __aarch64__ |
560 | __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; | | 594 | __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; |
561 | | | 595 | |
562 | __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); | | 596 | __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); |
563 | #else | | 597 | #else |
564 | __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; | | 598 | __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; |
565 | | | 599 | |
566 | __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); | | 600 | __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); |
567 | #endif | | 601 | #endif |
568 | #elif defined(__clang__) | | 602 | #elif defined(__clang__) |
569 | #ifndef __LITTLE_ENDIAN__ | | 603 | #ifndef __LITTLE_ENDIAN__ |
570 | __v = __builtin_shufflevector(__v, __v, | | 604 | __v = __builtin_shufflevector(__v, __v, |
571 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | | 605 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); |
572 | #endif | | 606 | #endif |
573 | __builtin_neon_vst1q_v(__p8, __v, 48); | | 607 | __builtin_neon_vst1q_v(__p8, __v, 48); |
574 | #endif | | 608 | #endif |
575 | } | | 609 | } |
576 | | | 610 | |
577 | #ifndef __aarch64__ /* XXX */ | | 611 | #ifndef __aarch64__ /* XXX */ |
578 | | | 612 | |
579 | _INTRINSATTR | | 613 | _INTRINSATTR |
580 | static __inline uint8x8_t | | 614 | static __inline uint8x8_t |
581 | vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) | | 615 | vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) |
582 | { | | 616 | { |
583 | #if defined(__GNUC__) && !defined(__clang__) | | 617 | #if defined(__GNUC__) && !defined(__clang__) |
584 | return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, | | 618 | return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, |
585 | (int8x8_t)__idx); | | 619 | (int8x8_t)__idx); |
586 | #elif defined(__clang__) | | 620 | #elif defined(__clang__) |
587 | uint8x8_t __ret; | | 621 | uint8x8_t __ret; |
588 | #ifndef __LITTLE_ENDIAN__ | | 622 | #ifndef __LITTLE_ENDIAN__ |
589 | __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); | | 623 | __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); |
590 | __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); | | 624 | __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); |
591 | #endif | | 625 | #endif |
592 | __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, | | 626 | __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, |
593 | (int8x8_t)__idx, 16); | | 627 | (int8x8_t)__idx, 16); |
594 | #ifndef __LITTLE_ENDIAN__ | | 628 | #ifndef __LITTLE_ENDIAN__ |
595 | __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); | | 629 | __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); |
596 | #endif | | 630 | #endif |
597 | return __ret; | | 631 | return __ret; |
598 | #endif | | 632 | #endif |
599 | } | | 633 | } |
600 | | | 634 | |
601 | _INTRINSATTR | | 635 | _INTRINSATTR |
602 | static __inline uint8x8_t | | 636 | static __inline uint8x8_t |
603 | vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) | | 637 | vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) |
604 | { | | 638 | { |
605 | #if defined(__GNUC__) && !defined(__clang__) | | 639 | #if defined(__GNUC__) && !defined(__clang__) |
606 | union { | | 640 | union { |
607 | uint8x8x2_t __u8x8x82; | | 641 | uint8x8x2_t __u8x8x82; |
608 | __builtin_neon_ti __ti; | | 642 | __builtin_neon_ti __ti; |
609 | } __u = { __tab }; | | 643 | } __u = { __tab }; |
610 | return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); | | 644 | return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); |
611 | #elif defined(__clang__) | | 645 | #elif defined(__clang__) |
612 | uint8x8_t __ret; | | 646 | uint8x8_t __ret; |
613 | #ifndef __LITTLE_ENDIAN__ | | 647 | #ifndef __LITTLE_ENDIAN__ |
614 | __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], | | 648 | __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], |
615 | 7,6,5,4,3,2,1,0); | | 649 | 7,6,5,4,3,2,1,0); |
616 | __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], | | 650 | __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], |
617 | 7,6,5,4,3,2,1,0); | | 651 | 7,6,5,4,3,2,1,0); |
618 | __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); | | 652 | __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); |
619 | #endif | | 653 | #endif |
620 | __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], | | 654 | __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], |
621 | (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); | | 655 | (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); |
622 | #ifndef __LITTLE_ENDIAN__ | | 656 | #ifndef __LITTLE_ENDIAN__ |
623 | __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); | | 657 | __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); |
624 | #endif | | 658 | #endif |
625 | return __ret; | | 659 | return __ret; |
626 | #endif | | 660 | #endif |
627 | } | | 661 | } |
628 | | | 662 | |
629 | #endif /* !defined(__aarch64__) */ | | 663 | #endif /* !defined(__aarch64__) */ |
630 | | | 664 | |
631 | #endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */ | | 665 | #endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */ |
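
The vsriq_n_u32 added in revision 1.3 pairs with vshlq_n_u32 (as vsliq_n_s32 pairs with vshrq_n_u32) to compose constant-distance 32-bit rotates, which the ChaCha quarter-round needs. A minimal sketch of how a caller might build such a rotate, assuming the rotate count is a compile-time constant; the helper name vrolq_n_u32 is illustrative and not part of this header:

	/*
	 * Rotate each 32-bit lane of __x left by __n bits: x << __n
	 * provides bits [31:__n], and VSRI inserts x >> (32 - __n)
	 * into the remaining low __n bits, leaving the top 32 - __n
	 * bits of its first operand intact.
	 */
	static __inline uint32x4_t
	vrolq_n_u32(uint32x4_t __x, uint8_t __n)
	{
		return vsriq_n_u32(vshlq_n_u32(__x, __n), __x, 32 - __n);
	}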