Mon Jul 27 20:58:56 2020 UTC
Add vsriq_n_u32. Note that VSRI seems to hurt here: the VSHL/VSHR/VORR
rotate in chacha_neon.c is kept, since the VSHL/VSRI variant is slower in
practice, at least on Cortex-A8.


(riastradh)
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c
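
For context, the rotate the log message refers to can be written two ways
with the intrinsics defined in arm_neon.h below.  A minimal sketch (the
helper names rol32_orr/rol32_sri are illustrative, not part of the tree;
the committed vrolq_n_u32 keeps the first form):

	/* Rotate each 32-bit lane left by n (0 < n < 32): shift/shift/or. */
	static inline uint32x4_t
	rol32_orr(uint32x4_t x, uint8_t n)
	{
		return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
	}

	/*
	 * Same rotation via shift-left plus shift-right-and-insert (VSRI).
	 * One instruction shorter, but slower in practice, at least on
	 * Cortex-A8, which is why the form above stays enabled.
	 */
	static inline uint32x4_t
	rol32_sri(uint32x4_t x, uint8_t n)
	{
		return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
	}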

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h

--- src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:06 1.2
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:56 1.3
@@ -1,631 +1,665 @@ @@ -1,631 +1,665 @@
1/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H 29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H 30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
31 31
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint16x8_t uint16x8_t; 42typedef __Uint16x8_t uint16x8_t;
43typedef __Uint32x4_t uint32x4_t; 43typedef __Uint32x4_t uint32x4_t;
44typedef __Uint64x2_t uint64x2_t; 44typedef __Uint64x2_t uint64x2_t;
45typedef __Uint8x16_t uint8x16_t; 45typedef __Uint8x16_t uint8x16_t;
46#else 46#else
47typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
48typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
49typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
50typedef __simd128_uint16_t uint16x8_t; 50typedef __simd128_uint16_t uint16x8_t;
51typedef __simd128_uint32_t uint32x4_t; 51typedef __simd128_uint32_t uint32x4_t;
52typedef __simd128_uint64_t uint64x2_t; 52typedef __simd128_uint64_t uint64x2_t;
53typedef __simd128_uint8_t uint8x16_t; 53typedef __simd128_uint8_t uint8x16_t;
54 54
55typedef __simd64_int8_t int8x8_t; 55typedef __simd64_int8_t int8x8_t;
56typedef __simd64_uint8_t uint8x8_t; 56typedef __simd64_uint8_t uint8x8_t;
57typedef __builtin_neon_udi uint64x1_t; 57typedef __builtin_neon_udi uint64x1_t;
58typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 58typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
59#endif 59#endif
60 60
61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
63#else 63#else
64#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
65#endif 65#endif
66 66
67#elif defined(__clang__) 67#elif defined(__clang__)
68 68
69#define _INTRINSATTR \ 69#define _INTRINSATTR \
70 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
71 71
72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; 72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; 73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; 74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
75 75
76typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; 76typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
77typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; 77typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
78typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; 78typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
79typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; 79typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
80 80
81typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; 81typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
82typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 82typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
83 83
84#ifdef __LITTLE_ENDIAN__ 84#ifdef __LITTLE_ENDIAN__
85#define __neon_lane_index(__v, __i) __i 85#define __neon_lane_index(__v, __i) __i
86#else 86#else
87#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 87#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
88#endif 88#endif
89 89
90#else 90#else
91 91
92#error Teach me how to neon in your compile! 92#error Teach me how to neon in your compile!
93 93
94#endif 94#endif
95 95
96_INTRINSATTR 96_INTRINSATTR
97static __inline uint32x4_t 97static __inline uint32x4_t
98vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) 98vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
99{ 99{
100 return __v0 + __v1; 100 return __v0 + __v1;
101} 101}
102 102
103_INTRINSATTR 103_INTRINSATTR
104static __inline uint32x4_t 104static __inline uint32x4_t
105vcltq_s32(int32x4_t __v0, int32x4_t __v1) 105vcltq_s32(int32x4_t __v0, int32x4_t __v1)
106{ 106{
107 return (uint32x4_t)(__v0 < __v1); 107 return (uint32x4_t)(__v0 < __v1);
108} 108}
109 109
110_INTRINSATTR 110_INTRINSATTR
111static __inline int32x4_t 111static __inline int32x4_t
112vdupq_n_s32(int32_t __x) 112vdupq_n_s32(int32_t __x)
113{ 113{
114 return (int32x4_t) { __x, __x, __x, __x }; 114 return (int32x4_t) { __x, __x, __x, __x };
115} 115}
116 116
117_INTRINSATTR 117_INTRINSATTR
118static __inline uint32x4_t 118static __inline uint32x4_t
119vdupq_n_u32(uint32_t __x) 119vdupq_n_u32(uint32_t __x)
120{ 120{
121 return (uint32x4_t) { __x, __x, __x, __x }; 121 return (uint32x4_t) { __x, __x, __x, __x };
122} 122}
123 123
124_INTRINSATTR 124_INTRINSATTR
125static __inline uint8x16_t 125static __inline uint8x16_t
126vdupq_n_u8(uint8_t __x) 126vdupq_n_u8(uint8_t __x)
127{ 127{
128 return (uint8x16_t) { 128 return (uint8x16_t) {
129 __x, __x, __x, __x, __x, __x, __x, __x, 129 __x, __x, __x, __x, __x, __x, __x, __x,
130 __x, __x, __x, __x, __x, __x, __x, __x, 130 __x, __x, __x, __x, __x, __x, __x, __x,
131 }; 131 };
132} 132}
133 133
134#if defined(__GNUC__) && !defined(__clang__) 134#if defined(__GNUC__) && !defined(__clang__)
135_INTRINSATTR 135_INTRINSATTR
136static __inline uint32x4_t 136static __inline uint32x4_t
137vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) 137vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
138{ 138{
139#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 139#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
140 return __builtin_shuffle(__hi, __lo, 140 return __builtin_shuffle(__hi, __lo,
141 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); 141 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
142#else 142#else
143 return __builtin_shuffle(__lo, __hi, 143 return __builtin_shuffle(__lo, __hi,
144 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); 144 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
145#endif 145#endif
146} 146}
147#elif defined(__clang__) 147#elif defined(__clang__)
148#ifdef __LITTLE_ENDIAN__ 148#ifdef __LITTLE_ENDIAN__
149#define vextq_u32(__lo, __hi, __i) \ 149#define vextq_u32(__lo, __hi, __i) \
150 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 150 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
151 (int8x16_t)(__hi), (__i), 50) 151 (int8x16_t)(__hi), (__i), 50)
152#else 152#else
153#define vextq_u32(__lo, __hi, __i) ( \ 153#define vextq_u32(__lo, __hi, __i) ( \
154{ \ 154{ \
155 uint32x4_t __tlo = (__lo); \ 155 uint32x4_t __tlo = (__lo); \
156 uint32x4_t __thi = (__hi); \ 156 uint32x4_t __thi = (__hi); \
157 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ 157 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
158 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ 158 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
159 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 159 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
160 (int8x16_t)__hi_r, __i, 50); \ 160 (int8x16_t)__hi_r, __i, 50); \
161 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 161 __builtin_shufflevector(__r, __r, 3,2,1,0); \
162}) 162})
163#endif /* __LITTLE_ENDIAN__ */ 163#endif /* __LITTLE_ENDIAN__ */
164#endif 164#endif
165 165
166#if defined(__GNUC__) && !defined(__clang__) 166#if defined(__GNUC__) && !defined(__clang__)
167_INTRINSATTR 167_INTRINSATTR
168static __inline uint8x16_t 168static __inline uint8x16_t
169vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) 169vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
170{ 170{
171#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 171#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
172 return __builtin_shuffle(__hi, __lo, 172 return __builtin_shuffle(__hi, __lo,
173 (uint8x16_t) { 173 (uint8x16_t) {
174 16 - __i, 17 - __i, 18 - __i, 19 - __i, 174 16 - __i, 17 - __i, 18 - __i, 19 - __i,
175 20 - __i, 21 - __i, 22 - __i, 23 - __i, 175 20 - __i, 21 - __i, 22 - __i, 23 - __i,
176 24 - __i, 25 - __i, 26 - __i, 27 - __i, 176 24 - __i, 25 - __i, 26 - __i, 27 - __i,
177 28 - __i, 29 - __i, 30 - __i, 31 - __i, 177 28 - __i, 29 - __i, 30 - __i, 31 - __i,
178 }); 178 });
179#else 179#else
180 return __builtin_shuffle(__lo, __hi, 180 return __builtin_shuffle(__lo, __hi,
181 (uint8x16_t) { 181 (uint8x16_t) {
182 __i + 0, __i + 1, __i + 2, __i + 3, 182 __i + 0, __i + 1, __i + 2, __i + 3,
183 __i + 4, __i + 5, __i + 6, __i + 7, 183 __i + 4, __i + 5, __i + 6, __i + 7,
184 __i + 8, __i + 9, __i + 10, __i + 11, 184 __i + 8, __i + 9, __i + 10, __i + 11,
185 __i + 12, __i + 13, __i + 14, __i + 15, 185 __i + 12, __i + 13, __i + 14, __i + 15,
186 }); 186 });
187#endif 187#endif
188} 188}
189#elif defined(__clang__) 189#elif defined(__clang__)
190#ifdef __LITTLE_ENDIAN__ 190#ifdef __LITTLE_ENDIAN__
191#define vextq_u8(__lo, __hi, __i) \ 191#define vextq_u8(__lo, __hi, __i) \
192 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 192 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
193 (int8x16_t)(__hi), (__i), 48) 193 (int8x16_t)(__hi), (__i), 48)
194#else 194#else
195#define vextq_u8(__lo, __hi, __i) ( \ 195#define vextq_u8(__lo, __hi, __i) ( \
196{ \ 196{ \
197 uint8x16_t __tlo = (__lo); \ 197 uint8x16_t __tlo = (__lo); \
198 uint8x16_t __thi = (__hi); \ 198 uint8x16_t __thi = (__hi); \
199 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ 199 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
201 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ 201 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
203 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 203 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
204 (int8x16_t)__hi_r, (__i), 48); \ 204 (int8x16_t)__hi_r, (__i), 48); \
 205	__builtin_shufflevector(__r, __r, \	 205	__builtin_shufflevector(__r, __r, \
206 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 206 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
207}) 207})
208#endif /* __LITTLE_ENDIAN */ 208#endif /* __LITTLE_ENDIAN */
209#endif 209#endif
210 210
211#if defined(__GNUC__) && !defined(__clang__) 211#if defined(__GNUC__) && !defined(__clang__)
212_INTRINSATTR 212_INTRINSATTR
213static __inline uint32_t 213static __inline uint32_t
214vgetq_lane_u32(uint32x4_t __v, uint8_t __i) 214vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
215{ 215{
216#ifdef __aarch64__ 216#ifdef __aarch64__
217 return __v[__i]; 217 return __v[__i];
218#else 218#else
219 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); 219 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
220#endif 220#endif
221} 221}
222#elif defined(__clang__) 222#elif defined(__clang__)
223#define vgetq_lane_u32(__v, __i) \ 223#define vgetq_lane_u32(__v, __i) \
224 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ 224 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
225 __neon_lane_index(__v, __i)) 225 __neon_lane_index(__v, __i))
226#endif 226#endif
227 227
228_INTRINSATTR 228_INTRINSATTR
229static __inline uint32x4_t 229static __inline uint32x4_t
230vld1q_u32(const uint32_t *__p32) 230vld1q_u32(const uint32_t *__p32)
231{ 231{
232#if defined(__GNUC__) && !defined(__clang__) 232#if defined(__GNUC__) && !defined(__clang__)
233#ifdef __aarch64__ 233#ifdef __aarch64__
234 const __builtin_aarch64_simd_si *__p = 234 const __builtin_aarch64_simd_si *__p =
235 (const __builtin_aarch64_simd_si *)__p32; 235 (const __builtin_aarch64_simd_si *)__p32;
236 236
237 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); 237 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
238#else 238#else
239 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; 239 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
240 240
241 return (uint32x4_t)__builtin_neon_vld1v4si(__p); 241 return (uint32x4_t)__builtin_neon_vld1v4si(__p);
242#endif 242#endif
243#elif defined(__clang__) 243#elif defined(__clang__)
244 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); 244 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
245#ifndef __LITTLE_ENDIAN__ 245#ifndef __LITTLE_ENDIAN__
246 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 246 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
247#endif 247#endif
248 return __v; 248 return __v;
249#endif 249#endif
250} 250}
251 251
252_INTRINSATTR 252_INTRINSATTR
253static __inline uint8x16_t 253static __inline uint8x16_t
254vld1q_u8(const uint8_t *__p8) 254vld1q_u8(const uint8_t *__p8)
255{ 255{
256#if defined(__GNUC__) && !defined(__clang__) 256#if defined(__GNUC__) && !defined(__clang__)
257#ifdef __aarch64__ 257#ifdef __aarch64__
258 const __builtin_aarch64_simd_qi *__p = 258 const __builtin_aarch64_simd_qi *__p =
259 (const __builtin_aarch64_simd_qi *)__p8; 259 (const __builtin_aarch64_simd_qi *)__p8;
260 260
261 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); 261 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
262#else 262#else
263 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; 263 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
264 264
265 return (uint8x16_t)__builtin_neon_vld1v16qi(__p); 265 return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
266#endif 266#endif
267#elif defined(__clang__) 267#elif defined(__clang__)
268 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); 268 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
269#ifndef __LITTLE_ENDIAN__ 269#ifndef __LITTLE_ENDIAN__
270 __v = __builtin_shufflevector(__v, __v, 270 __v = __builtin_shufflevector(__v, __v,
271 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 271 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
272#endif 272#endif
273 return __v; 273 return __v;
274#endif 274#endif
275} 275}
276 276
277_INTRINSATTR 277_INTRINSATTR
278static __inline uint8x16_t 278static __inline uint8x16_t
279vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) 279vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
280{ 280{
281#if defined(__GNUC__) && !defined(__clang__) 281#if defined(__GNUC__) && !defined(__clang__)
282#ifdef __aarch64__ 282#ifdef __aarch64__
283 uint8x16_t __res; 283 uint8x16_t __res;
284 __asm__("tbl %0.16b, {%1.16b}, %2.16b" 284 __asm__("tbl %0.16b, {%1.16b}, %2.16b"
285 : "=w"(__res) : "w"(__tab), "w"(__idx)); 285 : "=w"(__res) : "w"(__tab), "w"(__idx));
286 return __res; 286 return __res;
287#else 287#else
288 /* 288 /*
289 * No native ARMv7 NEON instruction for this, so do it via two 289 * No native ARMv7 NEON instruction for this, so do it via two
290 * half-width TBLs instead (vtbl2_u8 equivalent). 290 * half-width TBLs instead (vtbl2_u8 equivalent).
291 */ 291 */
292 uint64x2_t __tab64 = (uint64x2_t)__tab; 292 uint64x2_t __tab64 = (uint64x2_t)__tab;
293 uint8x8_t __tablo = (uint8x8_t)__tab64[0]; 293 uint8x8_t __tablo = (uint8x8_t)__tab64[0];
294 uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; 294 uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
295 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; 295 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
296 union { 296 union {
297 uint8x8x2_t __u8x8x2; 297 uint8x8x2_t __u8x8x2;
298 __builtin_neon_ti __ti; 298 __builtin_neon_ti __ti;
299 } __u = { __tab8x8x2 }; 299 } __u = { __tab8x8x2 };
300 uint64x2_t __idx64, __out64; 300 uint64x2_t __idx64, __out64;
301 int8x8_t __idxlo, __idxhi, __outlo, __outhi; 301 int8x8_t __idxlo, __idxhi, __outlo, __outhi;
302 302
303 __idx64 = (uint64x2_t)__idx; 303 __idx64 = (uint64x2_t)__idx;
304 __idxlo = (int8x8_t)__idx64[0]; 304 __idxlo = (int8x8_t)__idx64[0];
305 __idxhi = (int8x8_t)__idx64[1]; 305 __idxhi = (int8x8_t)__idx64[1];
306 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); 306 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
307 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); 307 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
308 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; 308 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
309 309
310 return (uint8x16_t)__out64; 310 return (uint8x16_t)__out64;
311#endif 311#endif
312#elif defined(__clang__) 312#elif defined(__clang__)
313#ifdef __LITTLE_ENDIAN__ 313#ifdef __LITTLE_ENDIAN__
314 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, 314 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
315 (int8x16_t)__idx, 48); 315 (int8x16_t)__idx, 48);
316#else 316#else
 317	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,	 317	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,
 318	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);	 318	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 319	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,	 319	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,
 320	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);	 320	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 321	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,	 321	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,
 322	    (int8x16_t)__idx_r, 48);	 322	    (int8x16_t)__idx_r, 48);
323 return __builtin_shufflevector(__r, __r, 323 return __builtin_shufflevector(__r, __r,
324 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 324 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
325#endif 325#endif
326#endif 326#endif
327} 327}
328 328
329_INTRINSATTR 329_INTRINSATTR
330static __inline int32x4_t 330static __inline int32x4_t
331vreinterpretq_s32_u8(uint8x16_t __v) 331vreinterpretq_s32_u8(uint8x16_t __v)
332{ 332{
333 return (int32x4_t)__v; 333 return (int32x4_t)__v;
334} 334}
335 335
336_INTRINSATTR 336_INTRINSATTR
337static __inline uint16x8_t 337static __inline uint16x8_t
338vreinterpretq_u16_u32(uint32x4_t __v) 338vreinterpretq_u16_u32(uint32x4_t __v)
339{ 339{
340 return (uint16x8_t)__v; 340 return (uint16x8_t)__v;
341} 341}
342 342
343_INTRINSATTR 343_INTRINSATTR
344static __inline uint32x4_t 344static __inline uint32x4_t
345vreinterpretq_u32_u16(uint16x8_t __v) 345vreinterpretq_u32_u16(uint16x8_t __v)
346{ 346{
347 return (uint32x4_t)__v; 347 return (uint32x4_t)__v;
348} 348}
349 349
350_INTRINSATTR 350_INTRINSATTR
351static __inline uint32x4_t 351static __inline uint32x4_t
352vreinterpretq_u32_u64(uint64x2_t __v) 352vreinterpretq_u32_u64(uint64x2_t __v)
353{ 353{
354 return (uint32x4_t)__v; 354 return (uint32x4_t)__v;
355} 355}
356 356
357_INTRINSATTR 357_INTRINSATTR
358static __inline uint32x4_t 358static __inline uint32x4_t
359vreinterpretq_u32_u8(uint8x16_t __v) 359vreinterpretq_u32_u8(uint8x16_t __v)
360{ 360{
361 return (uint32x4_t)__v; 361 return (uint32x4_t)__v;
362} 362}
363 363
364_INTRINSATTR 364_INTRINSATTR
365static __inline uint64x2_t 365static __inline uint64x2_t
366vreinterpretq_u64_u32(uint32x4_t __v) 366vreinterpretq_u64_u32(uint32x4_t __v)
367{ 367{
368 return (uint64x2_t)__v; 368 return (uint64x2_t)__v;
369} 369}
370 370
371_INTRINSATTR 371_INTRINSATTR
372static __inline uint64x2_t 372static __inline uint64x2_t
373vreinterpretq_u64_u8(uint8x16_t __v) 373vreinterpretq_u64_u8(uint8x16_t __v)
374{ 374{
375 return (uint64x2_t)__v; 375 return (uint64x2_t)__v;
376} 376}
377 377
378_INTRINSATTR 378_INTRINSATTR
379static __inline uint8x16_t 379static __inline uint8x16_t
380vreinterpretq_u8_s32(int32x4_t __v) 380vreinterpretq_u8_s32(int32x4_t __v)
381{ 381{
382 return (uint8x16_t)__v; 382 return (uint8x16_t)__v;
383} 383}
384 384
385_INTRINSATTR 385_INTRINSATTR
386static __inline uint8x16_t 386static __inline uint8x16_t
387vreinterpretq_u8_u32(uint32x4_t __v) 387vreinterpretq_u8_u32(uint32x4_t __v)
388{ 388{
389 return (uint8x16_t)__v; 389 return (uint8x16_t)__v;
390} 390}
391 391
392_INTRINSATTR 392_INTRINSATTR
393static __inline uint8x16_t 393static __inline uint8x16_t
394vreinterpretq_u8_u64(uint64x2_t __v) 394vreinterpretq_u8_u64(uint64x2_t __v)
395{ 395{
396 return (uint8x16_t)__v; 396 return (uint8x16_t)__v;
397} 397}
398 398
399_INTRINSATTR 399_INTRINSATTR
400static __inline uint16x8_t 400static __inline uint16x8_t
401vrev32q_u16(uint16x8_t __v) 401vrev32q_u16(uint16x8_t __v)
402{ 402{
403#if defined(__GNUC__) && !defined(__clang__) 403#if defined(__GNUC__) && !defined(__clang__)
404 return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); 404 return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
405#elif defined(__clang__) 405#elif defined(__clang__)
 406	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);	 406	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
407#endif 407#endif
408} 408}
409 409
410_INTRINSATTR 410_INTRINSATTR
411static __inline uint8x16_t 411static __inline uint8x16_t
412vrev32q_u8(uint8x16_t __v) 412vrev32q_u8(uint8x16_t __v)
413{ 413{
414#if defined(__GNUC__) && !defined(__clang__) 414#if defined(__GNUC__) && !defined(__clang__)
415 return __builtin_shuffle(__v, 415 return __builtin_shuffle(__v,
416 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); 416 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
417#elif defined(__clang__) 417#elif defined(__clang__)
 418	return __builtin_shufflevector(__v, __v,	 418	return __builtin_shufflevector(__v, __v,
419 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); 419 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
420#endif 420#endif
421} 421}
422 422
423#if defined(__GNUC__) && !defined(__clang__) 423#if defined(__GNUC__) && !defined(__clang__)
424_INTRINSATTR 424_INTRINSATTR
425static __inline uint32x4_t 425static __inline uint32x4_t
426vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) 426vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
427{ 427{
428 __v[__neon_lane_index(__v, __i)] = __x; 428 __v[__neon_lane_index(__v, __i)] = __x;
429 return __v; 429 return __v;
430} 430}
431#elif defined(__clang__) 431#elif defined(__clang__)
432#define vsetq_lane_u32(__x, __v, __i) \ 432#define vsetq_lane_u32(__x, __v, __i) \
433 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ 433 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
434 __neon_lane_index(__v, __i)) 434 __neon_lane_index(__v, __i))
435#endif 435#endif
436 436
437#if defined(__GNUC__) && !defined(__clang__) 437#if defined(__GNUC__) && !defined(__clang__)
438_INTRINSATTR 438_INTRINSATTR
439static __inline uint64x2_t 439static __inline uint64x2_t
440vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) 440vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
441{ 441{
442 __v[__neon_lane_index(__v, __i)] = __x; 442 __v[__neon_lane_index(__v, __i)] = __x;
443 return __v; 443 return __v;
444} 444}
445#elif defined(__clang__) 445#elif defined(__clang__)
446#define vsetq_lane_u64(__x, __v, __i) \ 446#define vsetq_lane_u64(__x, __v, __i) \
 447	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \	 447	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
 448	    __neon_lane_index(__v, __i))	 448	    __neon_lane_index(__v, __i))
449#endif 449#endif
450 450
451#if defined(__GNUC__) && !defined(__clang__) 451#if defined(__GNUC__) && !defined(__clang__)
452_INTRINSATTR 452_INTRINSATTR
453static __inline uint32x4_t 453static __inline uint32x4_t
454vshlq_n_u32(uint32x4_t __v, uint8_t __bits) 454vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
455{ 455{
456#ifdef __aarch64__ 456#ifdef __aarch64__
457 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); 457 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
458#else 458#else
459 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); 459 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
460#endif 460#endif
461} 461}
462#elif defined(__clang__) 462#elif defined(__clang__)
463#define vshlq_n_u32(__v, __bits) \ 463#define vshlq_n_u32(__v, __bits) \
464 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) 464 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
465#endif 465#endif
466 466
467#if defined(__GNUC__) && !defined(__clang__) 467#if defined(__GNUC__) && !defined(__clang__)
468_INTRINSATTR 468_INTRINSATTR
469static __inline uint32x4_t 469static __inline uint32x4_t
470vshrq_n_u32(uint32x4_t __v, uint8_t __bits) 470vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
471{ 471{
472#ifdef __aarch64__ 472#ifdef __aarch64__
473 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); 473 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
474#else 474#else
475 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); 475 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
476#endif 476#endif
477} 477}
478#elif defined(__clang__) 478#elif defined(__clang__)
 479#define vshrq_n_u32(__v, __bits)	\	 479#define vshrq_n_u32(__v, __bits)	\
480 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) 480 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
481#endif 481#endif
482 482
483#if defined(__GNUC__) && !defined(__clang__) 483#if defined(__GNUC__) && !defined(__clang__)
484_INTRINSATTR 484_INTRINSATTR
485static __inline uint8x16_t 485static __inline uint8x16_t
486vshrq_n_u8(uint8x16_t __v, uint8_t __bits) 486vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
487{ 487{
488#ifdef __aarch64__ 488#ifdef __aarch64__
489 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); 489 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
490#else 490#else
491 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); 491 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
492#endif 492#endif
493} 493}
494#elif defined(__clang__) 494#elif defined(__clang__)
495#define vshrq_n_u8(__v, __bits) \ 495#define vshrq_n_u8(__v, __bits) \
496 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) 496 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
497#endif 497#endif
498 498
499#if defined(__GNUC__) && !defined(__clang__) 499#if defined(__GNUC__) && !defined(__clang__)
500_INTRINSATTR 500_INTRINSATTR
501static __inline int32x4_t 501static __inline int32x4_t
502vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) 502vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
503{ 503{
504#ifdef __aarch64__ 504#ifdef __aarch64__
505 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); 505 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
506#else 506#else
507 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); 507 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
508#endif 508#endif
509} 509}
510#elif defined(__clang__) 510#elif defined(__clang__)
511#ifdef __LITTLE_ENDIAN__ 511#ifdef __LITTLE_ENDIAN__
512#define vsliq_n_s32(__vins, __vsh, __bits) \ 512#define vsliq_n_s32(__vins, __vsh, __bits) \
513 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ 513 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
514 (int32x4_t)(__vsh), (__bits), 34) 514 (int32x4_t)(__vsh), (__bits), 34)
515#else 515#else
516#define vsliq_n_s32(__vins, __vsh, __bits) ( \ 516#define vsliq_n_s32(__vins, __vsh, __bits) ( \
517{ \ 517{ \
518 int32x4_t __tvins = (__vins); \ 518 int32x4_t __tvins = (__vins); \
519 int32x4_t __tvsh = (__vsh); \ 519 int32x4_t __tvsh = (__vsh); \
520 uint8_t __tbits = (__bits); \ 520 uint8_t __tbits = (__bits); \
521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ 521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
522 3,2,1,0); \ 522 3,2,1,0); \
523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ 523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
524 3,2,1,0); \ 524 3,2,1,0); \
 525	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \	 525	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \
526 34); \ 526 34); \
527 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 527 __builtin_shufflevector(__r, __r, 3,2,1,0); \
528}) 528})
529#endif /* __LITTLE_ENDIAN__ */ 529#endif /* __LITTLE_ENDIAN__ */
530#endif 530#endif
531 531
 532#if defined(__GNUC__) && !defined(__clang__)
 533_INTRINSATTR
 534static __inline uint32x4_t
 535vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
 536{
 537#ifdef __aarch64__
 538 return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
 539#else
 540 return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
 541 (int32x4_t)__vsh, __bits);
 542#endif
 543}
 544#elif defined(__clang__)
 545#ifdef __LITTLE_ENDIAN__
 546#define vsriq_n_u32(__vins, __vsh, __bits) \
 547 (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
 548 (int32x4_t)(__vsh), (__bits), 34)
 549#else
	 550#define vsriq_n_u32(__vins, __vsh, __bits) ( \
 551{ \
 552 int32x4_t __tvins = (__vins); \
 553 int32x4_t __tvsh = (__vsh); \
 554 uint8_t __tbits = (__bits); \
 555 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
 556 3,2,1,0); \
 557 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
 558 3,2,1,0); \
	 559	int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, __tbits, \
 560 34); \
 561 __builtin_shufflevector(__r, __r, 3,2,1,0); \
 562})
 563#endif
 564#endif
 565
532_INTRINSATTR 566_INTRINSATTR
533static __inline void 567static __inline void
534vst1q_u32(uint32_t *__p32, uint32x4_t __v) 568vst1q_u32(uint32_t *__p32, uint32x4_t __v)
535{ 569{
536#if defined(__GNUC__) && !defined(__clang__) 570#if defined(__GNUC__) && !defined(__clang__)
537#ifdef __aarch64__ 571#ifdef __aarch64__
538 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; 572 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
539 573
540 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); 574 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
541#else 575#else
542 __builtin_neon_si *__p = (__builtin_neon_si *)__p32; 576 __builtin_neon_si *__p = (__builtin_neon_si *)__p32;
543 577
544 __builtin_neon_vst1v4si(__p, (int32x4_t)__v); 578 __builtin_neon_vst1v4si(__p, (int32x4_t)__v);
545#endif 579#endif
546#elif defined(__clang__) 580#elif defined(__clang__)
547#ifndef __LITTLE_ENDIAN__ 581#ifndef __LITTLE_ENDIAN__
548 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 582 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
549#endif 583#endif
550 __builtin_neon_vst1q_v(__p32, __v, 50); 584 __builtin_neon_vst1q_v(__p32, __v, 50);
551#endif 585#endif
552} 586}
553 587
554_INTRINSATTR 588_INTRINSATTR
555static __inline void 589static __inline void
556vst1q_u8(uint8_t *__p8, uint8x16_t __v) 590vst1q_u8(uint8_t *__p8, uint8x16_t __v)
557{ 591{
558#if defined(__GNUC__) && !defined(__clang__) 592#if defined(__GNUC__) && !defined(__clang__)
559#ifdef __aarch64__ 593#ifdef __aarch64__
560 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; 594 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
561 595
562 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); 596 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
563#else 597#else
564 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; 598 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
565 599
566 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); 600 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
567#endif 601#endif
568#elif defined(__clang__) 602#elif defined(__clang__)
569#ifndef __LITTLE_ENDIAN__ 603#ifndef __LITTLE_ENDIAN__
570 __v = __builtin_shufflevector(__v, __v, 604 __v = __builtin_shufflevector(__v, __v,
571 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 605 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
572#endif 606#endif
573 __builtin_neon_vst1q_v(__p8, __v, 48); 607 __builtin_neon_vst1q_v(__p8, __v, 48);
574#endif 608#endif
575} 609}
576 610
577#ifndef __aarch64__ /* XXX */ 611#ifndef __aarch64__ /* XXX */
578 612
579_INTRINSATTR 613_INTRINSATTR
580static __inline uint8x8_t 614static __inline uint8x8_t
581vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) 615vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
582{ 616{
583#if defined(__GNUC__) && !defined(__clang__) 617#if defined(__GNUC__) && !defined(__clang__)
584 return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, 618 return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
585 (int8x8_t)__idx); 619 (int8x8_t)__idx);
586#elif defined(__clang__) 620#elif defined(__clang__)
587 uint8x8_t __ret; 621 uint8x8_t __ret;
588#ifndef __LITTLE_ENDIAN__ 622#ifndef __LITTLE_ENDIAN__
589 __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); 623 __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
590 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); 624 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
591#endif 625#endif
592 __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, 626 __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
593 (int8x8_t)__idx, 16); 627 (int8x8_t)__idx, 16);
594#ifndef __LITTLE_ENDIAN__ 628#ifndef __LITTLE_ENDIAN__
595 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); 629 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
596#endif 630#endif
597 return __ret; 631 return __ret;
598#endif 632#endif
599} 633}
600 634
601_INTRINSATTR 635_INTRINSATTR
602static __inline uint8x8_t 636static __inline uint8x8_t
603vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) 637vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
604{ 638{
605#if defined(__GNUC__) && !defined(__clang__) 639#if defined(__GNUC__) && !defined(__clang__)
606 union { 640 union {
607 uint8x8x2_t __u8x8x82; 641 uint8x8x2_t __u8x8x82;
608 __builtin_neon_ti __ti; 642 __builtin_neon_ti __ti;
609 } __u = { __tab }; 643 } __u = { __tab };
610 return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); 644 return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
611#elif defined(__clang__) 645#elif defined(__clang__)
612 uint8x8_t __ret; 646 uint8x8_t __ret;
613#ifndef __LITTLE_ENDIAN__ 647#ifndef __LITTLE_ENDIAN__
614 __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], 648 __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
615 7,6,5,4,3,2,1,0); 649 7,6,5,4,3,2,1,0);
616 __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], 650 __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
617 7,6,5,4,3,2,1,0); 651 7,6,5,4,3,2,1,0);
618 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); 652 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
619#endif 653#endif
620 __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], 654 __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
621 (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); 655 (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
622#ifndef __LITTLE_ENDIAN__ 656#ifndef __LITTLE_ENDIAN__
623 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); 657 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
624#endif 658#endif
625 return __ret; 659 return __ret;
626#endif 660#endif
627} 661}
628 662
629#endif /* !defined(__aarch64__) */ 663#endif /* !defined(__aarch64__) */
630 664
631#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */ 665#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */
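
The vsriq_n_u32 intrinsic added above maps to VSRI (shift right and insert):
each 32-bit lane of __vsh is shifted right by __bits and inserted into the
corresponding lane of __vins, whose top __bits bits are left untouched.  A
worked lane, with illustrative values and __bits = 24:

	uint32_t vins = 0xaabbccdd, vsh = 0x11223344;
	/* vsh >> 24 == 0x00000011; top 24 bits of vins are preserved */
	uint32_t r = (vins & 0xffffff00) | (vsh >> 24);	/* == 0xaabbcc11 */

This is what makes vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n) a two-instruction
left rotate by n: the left shift provides the high bits and VSRI fills the
low n bits with x >> (32 - n), as in the (disabled) #else branch of
vrolq_n_u32 in the chacha_neon.c diff below.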

cvs diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c

--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:06 1.4
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:56 1.5
@@ -1,369 +1,377 @@ @@ -1,369 +1,377 @@
1/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/types.h> 29#include <sys/types.h>
30#include <sys/endian.h> 30#include <sys/endian.h>
31 31
32#include "arm_neon.h" 32#include "arm_neon.h"
33#include "chacha_neon.h" 33#include "chacha_neon.h"
34 34
35static inline uint32x4_t 35static inline uint32x4_t
36vrolq_n_u32(uint32x4_t x, uint8_t n) 36vrolq_n_u32(uint32x4_t x, uint8_t n)
37{ 37{
38 38
 39 /*
 40 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
 41 * practice it hurts performance at least on Cortex-A8.
 42 */
 43#if 1
39 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); 44 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
 45#else
 46 return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
 47#endif
40} 48}
41 49
42static inline uint32x4_t 50static inline uint32x4_t
43vhtole_u32(uint32x4_t x) 51vhtole_u32(uint32x4_t x)
44{ 52{
45#if _BYTE_ORDER == _LITTLE_ENDIAN 53#if _BYTE_ORDER == _LITTLE_ENDIAN
46 return x; 54 return x;
47#elif _BYTE_ORDER == _BIG_ENDIAN 55#elif _BYTE_ORDER == _BIG_ENDIAN
48 return vrev32q_u8(x); 56 return vrev32q_u8(x);
49#endif 57#endif
50} 58}
51 59
52static inline uint32x4_t 60static inline uint32x4_t
53vletoh_u32(uint32x4_t x) 61vletoh_u32(uint32x4_t x)
54{ 62{
55#if _BYTE_ORDER == _LITTLE_ENDIAN 63#if _BYTE_ORDER == _LITTLE_ENDIAN
56 return x; 64 return x;
57#elif _BYTE_ORDER == _BIG_ENDIAN 65#elif _BYTE_ORDER == _BIG_ENDIAN
58 return vrev32q_u8(x); 66 return vrev32q_u8(x);
59#endif 67#endif
60} 68}
61  69
62static inline uint32x4_t 70static inline uint32x4_t
63rol16(uint32x4_t x) 71rol16(uint32x4_t x)
64{ 72{
65 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); 73 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
66 74
67 y16 = vrev32q_u16(x16); 75 y16 = vrev32q_u16(x16);
68 76
69 return vreinterpretq_u32_u16(y16); 77 return vreinterpretq_u32_u16(y16);
70} 78}
71 79
72static inline uint32x4_t 80static inline uint32x4_t
73rol12(uint32x4_t x) 81rol12(uint32x4_t x)
74{ 82{
75 83
76 return vrolq_n_u32(x, 12); 84 return vrolq_n_u32(x, 12);
77} 85}
78 86
79static inline uint32x4_t 87static inline uint32x4_t
80rol8(uint32x4_t x) 88rol8(uint32x4_t x)
81{ 89{
82#if defined(__aarch64__) 90#if defined(__aarch64__)
83 static const uint8x16_t rol8_tab = { 91 static const uint8x16_t rol8_tab = {
84 3, 0, 1, 2, 7, 4, 5, 6, 92 3, 0, 1, 2, 7, 4, 5, 6,
85 11, 8, 9,10, 15,12,13,14, 93 11, 8, 9,10, 15,12,13,14,
86 }; 94 };
87 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); 95 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
88 96
89 y8 = vqtbl1q_u8(x8, rol8_tab); 97 y8 = vqtbl1q_u8(x8, rol8_tab);
90 98
91 return vreinterpretq_u32_u8(y8); 99 return vreinterpretq_u32_u8(y8);
92#elif 0 100#elif 0
93 /* 101 /*
94 * GCC does a lousy job with this, spilling two 64-bit vector 102 * GCC does a lousy job with this, spilling two 64-bit vector
95 * registers to the stack every time. There should be plenty 103 * registers to the stack every time. There should be plenty
96 * of vector registers free, requiring no spills at all, and 104 * of vector registers free, requiring no spills at all, and
97 * GCC should be able to hoist the load of rol8_tab out of any 105 * GCC should be able to hoist the load of rol8_tab out of any
98 * loops, but it doesn't and so attempting to use VTBL hurts 106 * loops, but it doesn't and so attempting to use VTBL hurts
99 * more than it helps. 107 * more than it helps.
100 */ 108 */
101 static const uint8x8_t rol8_tab = { 109 static const uint8x8_t rol8_tab = {
102 3, 0, 1, 2, 7, 4, 5, 6, 110 3, 0, 1, 2, 7, 4, 5, 6,
103 }; 111 };
104 112
105 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); 113 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
106 114
107 y64 = (uint64x2_t) { 115 y64 = (uint64x2_t) {
108 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), 116 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
109 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), 117 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
110 }; 118 };
111 119
112 return vreinterpretq_u32_u64(y64); 120 return vreinterpretq_u32_u64(y64);
113#else 121#else
114 return vrolq_n_u32(x, 8); 122 return vrolq_n_u32(x, 8);
115#endif 123#endif
116} 124}
117 125
118static inline uint32x4_t 126static inline uint32x4_t
119rol7(uint32x4_t x) 127rol7(uint32x4_t x)
120{ 128{
121 129
122 return vrolq_n_u32(x, 7); 130 return vrolq_n_u32(x, 7);
123} 131}
124  132
125static inline void 133static inline void
126chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, 134chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
127 unsigned nr) 135 unsigned nr)
128{ 136{
129 uint32x4_t r0, r1, r2, r3; 137 uint32x4_t r0, r1, r2, r3;
130 uint32x4_t c0, c1, c2, c3; 138 uint32x4_t c0, c1, c2, c3;
131 139
132 r0 = *p0; 140 r0 = *p0;
133 r1 = *p1; 141 r1 = *p1;
134 r2 = *p2; 142 r2 = *p2;
135 r3 = *p3; 143 r3 = *p3;
136 144
137 for (; nr > 0; nr -= 2) { 145 for (; nr > 0; nr -= 2) {
138 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); 146 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
139 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); 147 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
140 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); 148 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
141 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); 149 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
142 150
143 c0 = r0; 151 c0 = r0;
144 c1 = vextq_u32(r1, r1, 1); 152 c1 = vextq_u32(r1, r1, 1);
145 c2 = vextq_u32(r2, r2, 2); 153 c2 = vextq_u32(r2, r2, 2);
146 c3 = vextq_u32(r3, r3, 3); 154 c3 = vextq_u32(r3, r3, 3);
147 155
148 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); 156 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
149 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); 157 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
150 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); 158 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
151 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); 159 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
152 160
153 r0 = c0; 161 r0 = c0;
154 r1 = vextq_u32(c1, c1, 3); 162 r1 = vextq_u32(c1, c1, 3);
155 r2 = vextq_u32(c2, c2, 2); 163 r2 = vextq_u32(c2, c2, 2);
156 r3 = vextq_u32(c3, c3, 1); 164 r3 = vextq_u32(c3, c3, 1);
157 } 165 }
158 166
159 *p0 = r0; 167 *p0 = r0;
160 *p1 = r1; 168 *p1 = r1;
161 *p2 = r2; 169 *p2 = r2;
162 *p3 = r3; 170 *p3 = r3;
163} 171}
164  172
165void 173void
166chacha_core_neon(uint8_t out[restrict static 64], 174chacha_core_neon(uint8_t out[restrict static 64],
167 const uint8_t in[static 16], 175 const uint8_t in[static 16],
168 const uint8_t k[static 32], 176 const uint8_t k[static 32],
169 const uint8_t c[static 16], 177 const uint8_t c[static 16],
170 unsigned nr) 178 unsigned nr)
171{ 179{
172 uint32x4_t in0, in1, in2, in3; 180 uint32x4_t in0, in1, in2, in3;
173 uint32x4_t r0, r1, r2, r3; 181 uint32x4_t r0, r1, r2, r3;
174 182
175 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 183 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
176 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 184 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
177 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 185 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
178 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 186 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
179 187
180 chacha_permute(&r0, &r1, &r2, &r3, nr); 188 chacha_permute(&r0, &r1, &r2, &r3, nr);
181 189
182 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); 190 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
183 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); 191 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
184 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); 192 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
185 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); 193 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
186} 194}
187 195
188void 196void
189hchacha_neon(uint8_t out[restrict static 32], 197hchacha_neon(uint8_t out[restrict static 32],
190 const uint8_t in[static 16], 198 const uint8_t in[static 16],
191 const uint8_t k[static 32], 199 const uint8_t k[static 32],
192 const uint8_t c[static 16], 200 const uint8_t c[static 16],
193 unsigned nr) 201 unsigned nr)
194{ 202{
195 uint32x4_t r0, r1, r2, r3; 203 uint32x4_t r0, r1, r2, r3;
196 204
197 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 205 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
198 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 206 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
199 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 207 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
200 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 208 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
201 209
202 chacha_permute(&r0, &r1, &r2, &r3, nr); 210 chacha_permute(&r0, &r1, &r2, &r3, nr);
203 211
204 vst1q_u32((uint32_t *)out + 0, r0); 212 vst1q_u32((uint32_t *)out + 0, r0);
205 vst1q_u32((uint32_t *)out + 4, r3); 213 vst1q_u32((uint32_t *)out + 4, r3);
206} 214}
207  215
208void 216void
209chacha_stream_neon(uint8_t *restrict s, size_t n, 217chacha_stream_neon(uint8_t *restrict s, size_t n,
210 uint32_t blkno, 218 uint32_t blkno,
211 const uint8_t nonce[static 12], 219 const uint8_t nonce[static 12],
212 const uint8_t k[static 32], 220 const uint8_t k[static 32],
213 unsigned nr) 221 unsigned nr)
214{ 222{
215 223
216#ifdef __aarch64__ 224#ifdef __aarch64__
217 for (; n >= 256; s += 256, n -= 256, blkno += 4) 225 for (; n >= 256; s += 256, n -= 256, blkno += 4)
218 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); 226 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
219#endif 227#endif
220 228
221 if (n) { 229 if (n) {
222 const uint32x4_t blkno_inc = {1,0,0,0}; 230 const uint32x4_t blkno_inc = {1,0,0,0};
223 uint32x4_t in0, in1, in2, in3; 231 uint32x4_t in0, in1, in2, in3;
224 uint32x4_t r0, r1, r2, r3; 232 uint32x4_t r0, r1, r2, r3;
225 233
226 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 234 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
227 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 235 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
228 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 236 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
229 in3 = (uint32x4_t) { 237 in3 = (uint32x4_t) {
230 blkno, 238 blkno,
231 le32dec(nonce), 239 le32dec(nonce),
232 le32dec(nonce + 4), 240 le32dec(nonce + 4),
233 le32dec(nonce + 8) 241 le32dec(nonce + 8)
234 }; 242 };
235 243
236 for (; n; s += 64, n -= 64) { 244 for (; n; s += 64, n -= 64) {
237 r0 = in0; 245 r0 = in0;
238 r1 = in1; 246 r1 = in1;
239 r2 = in2; 247 r2 = in2;
240 r3 = in3; 248 r3 = in3;
241 chacha_permute(&r0, &r1, &r2, &r3, nr); 249 chacha_permute(&r0, &r1, &r2, &r3, nr);
242 r0 = vhtole_u32(vaddq_u32(r0, in0)); 250 r0 = vhtole_u32(vaddq_u32(r0, in0));
243 r1 = vhtole_u32(vaddq_u32(r1, in1)); 251 r1 = vhtole_u32(vaddq_u32(r1, in1));
244 r2 = vhtole_u32(vaddq_u32(r2, in2)); 252 r2 = vhtole_u32(vaddq_u32(r2, in2));
245 r3 = vhtole_u32(vaddq_u32(r3, in3)); 253 r3 = vhtole_u32(vaddq_u32(r3, in3));
246 254
247 if (n < 64) { 255 if (n < 64) {
248 uint8_t buf[64] __aligned(16); 256 uint8_t buf[64] __aligned(16);
249 257
250 vst1q_u32((uint32_t *)buf + 4*0, r0); 258 vst1q_u32((uint32_t *)buf + 4*0, r0);
251 vst1q_u32((uint32_t *)buf + 4*1, r1); 259 vst1q_u32((uint32_t *)buf + 4*1, r1);
252 vst1q_u32((uint32_t *)buf + 4*2, r2); 260 vst1q_u32((uint32_t *)buf + 4*2, r2);
253 vst1q_u32((uint32_t *)buf + 4*3, r3); 261 vst1q_u32((uint32_t *)buf + 4*3, r3);
254 memcpy(s, buf, n); 262 memcpy(s, buf, n);
255 263
256 break; 264 break;
257 } 265 }
258 266
259 vst1q_u32((uint32_t *)s + 4*0, r0); 267 vst1q_u32((uint32_t *)s + 4*0, r0);
260 vst1q_u32((uint32_t *)s + 4*1, r1); 268 vst1q_u32((uint32_t *)s + 4*1, r1);
261 vst1q_u32((uint32_t *)s + 4*2, r2); 269 vst1q_u32((uint32_t *)s + 4*2, r2);
262 vst1q_u32((uint32_t *)s + 4*3, r3); 270 vst1q_u32((uint32_t *)s + 4*3, r3);
263 in3 = vaddq_u32(in3, blkno_inc); 271 in3 = vaddq_u32(in3, blkno_inc);
264 } 272 }
265 } 273 }
266} 274}
267  275
268void 276void
269chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, 277chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
270 uint32_t blkno, 278 uint32_t blkno,
271 const uint8_t nonce[static 12], 279 const uint8_t nonce[static 12],
272 const uint8_t k[static 32], 280 const uint8_t k[static 32],
273 unsigned nr) 281 unsigned nr)
274{ 282{
275 283
276#ifdef __aarch64__ 284#ifdef __aarch64__
277 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) 285 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
278 chacha_stream_xor256_neon(s, p, blkno, nonce, k, 286 chacha_stream_xor256_neon(s, p, blkno, nonce, k,
279 chacha_const32, nr); 287 chacha_const32, nr);
280#endif 288#endif
281 289
282 if (n) { 290 if (n) {
283 const uint32x4_t blkno_inc = {1,0,0,0}; 291 const uint32x4_t blkno_inc = {1,0,0,0};
284 uint32x4_t in0, in1, in2, in3; 292 uint32x4_t in0, in1, in2, in3;
285 uint32x4_t r0, r1, r2, r3; 293 uint32x4_t r0, r1, r2, r3;
286 294
287 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 295 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
288 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 296 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
289 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 297 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
290 in3 = (uint32x4_t) { 298 in3 = (uint32x4_t) {
291 blkno, 299 blkno,
292 le32dec(nonce), 300 le32dec(nonce),
293 le32dec(nonce + 4), 301 le32dec(nonce + 4),
294 le32dec(nonce + 8) 302 le32dec(nonce + 8)
295 }; 303 };
296 304
297 for (; n; s += 64, p += 64, n -= 64) { 305 for (; n; s += 64, p += 64, n -= 64) {
298 r0 = in0; 306 r0 = in0;
299 r1 = in1; 307 r1 = in1;
300 r2 = in2; 308 r2 = in2;
301 r3 = in3; 309 r3 = in3;
302 chacha_permute(&r0, &r1, &r2, &r3, nr); 310 chacha_permute(&r0, &r1, &r2, &r3, nr);
303 r0 = vhtole_u32(vaddq_u32(r0, in0)); 311 r0 = vhtole_u32(vaddq_u32(r0, in0));
304 r1 = vhtole_u32(vaddq_u32(r1, in1)); 312 r1 = vhtole_u32(vaddq_u32(r1, in1));
305 r2 = vhtole_u32(vaddq_u32(r2, in2)); 313 r2 = vhtole_u32(vaddq_u32(r2, in2));
306 r3 = vhtole_u32(vaddq_u32(r3, in3)); 314 r3 = vhtole_u32(vaddq_u32(r3, in3));
307 315
308 if (n < 64) { 316 if (n < 64) {
309 uint8_t buf[64] __aligned(16); 317 uint8_t buf[64] __aligned(16);
310 unsigned i; 318 unsigned i;
311 319
312 vst1q_u32((uint32_t *)buf + 4*0, r0); 320 vst1q_u32((uint32_t *)buf + 4*0, r0);
313 vst1q_u32((uint32_t *)buf + 4*1, r1); 321 vst1q_u32((uint32_t *)buf + 4*1, r1);
314 vst1q_u32((uint32_t *)buf + 4*2, r2); 322 vst1q_u32((uint32_t *)buf + 4*2, r2);
315 vst1q_u32((uint32_t *)buf + 4*3, r3); 323 vst1q_u32((uint32_t *)buf + 4*3, r3);
316 324
317 for (i = 0; i < n - n%4; i += 4) 325 for (i = 0; i < n - n%4; i += 4)
318 le32enc(s + i, 326 le32enc(s + i,
319 le32dec(p + i) ^ le32dec(buf + i)); 327 le32dec(p + i) ^ le32dec(buf + i));
320 for (; i < n; i++) 328 for (; i < n; i++)
321 s[i] = p[i] ^ buf[i]; 329 s[i] = p[i] ^ buf[i];
322 330
323 break; 331 break;
324 } 332 }
325 333
326 r0 ^= vld1q_u32((const uint32_t *)p + 4*0); 334 r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
327 r1 ^= vld1q_u32((const uint32_t *)p + 4*1); 335 r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
328 r2 ^= vld1q_u32((const uint32_t *)p + 4*2); 336 r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
329 r3 ^= vld1q_u32((const uint32_t *)p + 4*3); 337 r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
330 vst1q_u32((uint32_t *)s + 4*0, r0); 338 vst1q_u32((uint32_t *)s + 4*0, r0);
331 vst1q_u32((uint32_t *)s + 4*1, r1); 339 vst1q_u32((uint32_t *)s + 4*1, r1);
332 vst1q_u32((uint32_t *)s + 4*2, r2); 340 vst1q_u32((uint32_t *)s + 4*2, r2);
333 vst1q_u32((uint32_t *)s + 4*3, r3); 341 vst1q_u32((uint32_t *)s + 4*3, r3);
334 in3 = vaddq_u32(in3, blkno_inc); 342 in3 = vaddq_u32(in3, blkno_inc);
335 } 343 }
336 } 344 }
337} 345}
338  346
339void 347void
340xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, 348xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
341 uint32_t blkno, 349 uint32_t blkno,
342 const uint8_t nonce[static 24], 350 const uint8_t nonce[static 24],
343 const uint8_t k[static 32], 351 const uint8_t k[static 32],
344 unsigned nr) 352 unsigned nr)
345{ 353{
346 uint8_t subkey[32]; 354 uint8_t subkey[32];
347 uint8_t subnonce[12]; 355 uint8_t subnonce[12];
348 356
349 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 357 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
350 memset(subnonce, 0, 4); 358 memset(subnonce, 0, 4);
351 memcpy(subnonce + 4, nonce + 16, 8); 359 memcpy(subnonce + 4, nonce + 16, 8);
352 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); 360 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
353} 361}
354 362
355void 363void
356xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 364xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
357 uint32_t blkno, 365 uint32_t blkno,
358 const uint8_t nonce[static 24], 366 const uint8_t nonce[static 24],
359 const uint8_t k[static 32], 367 const uint8_t k[static 32],
360 unsigned nr) 368 unsigned nr)
361{ 369{
362 uint8_t subkey[32]; 370 uint8_t subkey[32];
363 uint8_t subnonce[12]; 371 uint8_t subnonce[12];
364 372
365 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 373 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
366 memset(subnonce, 0, 4); 374 memset(subnonce, 0, 4);
367 memcpy(subnonce + 4, nonce + 16, 8); 375 memcpy(subnonce + 4, nonce + 16, 8);
368 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); 376 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
369} 377}