Tue Jul 28 20:08:48 2020 UTC
Implement 4-way vectorization of ChaCha for armv7 NEON.

cgd performance is not as good as I was hoping (~4% improvement over
chacha_ref.c), but it should improve substantially more if we let the
cgd worker thread keep FPU state, so that we don't have to pay the
cost of an isb and zeroing the FPU on every 512-byte cgd block.
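
To make the last point concrete, here is a rough editorial sketch, not
code from this commit.  chacha_stream_xor_neon() is the real entry point
declared in chacha_neon.h; the two helper functions are hypothetical,
fpu_kern_enter()/fpu_kern_leave() are assumed names for the kernel-FPU
bracketing, and cgd's per-block IV handling is elided (the batch is
treated as one continuous keystream purely for illustration).

#include <sys/types.h>

#include "chacha_neon.h"

/* assumed kernel-FPU bracketing hooks; see the note above */
void	fpu_kern_enter(void);
void	fpu_kern_leave(void);

/*
 * Today: the bracketing -- an isb on entry, zeroing the FPU on exit --
 * is paid once per 512-byte cgd block (eight 64-byte ChaCha blocks).
 */
static void
crypt_per_block(uint8_t *out, const uint8_t *in, size_t nblk,
    const uint8_t nonce[12], const uint8_t key[32])
{
	size_t i;

	for (i = 0; i < nblk; i++) {
		fpu_kern_enter();
		chacha_stream_xor_neon(out + 512*i, in + 512*i, 512,
		    /*blkno*/(uint32_t)(8*i), nonce, key, 20);
		fpu_kern_leave();
	}
}

/*
 * Proposed: the cgd worker thread keeps FPU state across the whole
 * batch, so the isb/zero cost is paid once rather than per block.
 */
static void
crypt_batched(uint8_t *out, const uint8_t *in, size_t nblk,
    const uint8_t nonce[12], const uint8_t key[32])
{
	size_t i;

	fpu_kern_enter();
	for (i = 0; i < nblk; i++)
		chacha_stream_xor_neon(out + 512*i, in + 512*i, 512,
		    /*blkno*/(uint32_t)(8*i), nonce, key, 20);
	fpu_kern_leave();
}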


(riastradh)
diff -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm
diff -r0 -r1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile

cvs diff -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c

--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:05:33 1.6
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:08:48 1.7
@@ -1,377 +1,373 @@
1/* $NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/types.h> 29#include <sys/types.h>
30#include <sys/endian.h> 30#include <sys/endian.h>
31 31
32#include "arm_neon.h" 32#include "arm_neon.h"
33#include "chacha_neon.h" 33#include "chacha_neon.h"
34 34
35static inline uint32x4_t 35static inline uint32x4_t
36vrolq_n_u32(uint32x4_t x, uint8_t n) 36vrolq_n_u32(uint32x4_t x, uint8_t n)
37{ 37{
38 38
39 /* 39 /*
40 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in 40 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
41 * practice it hurts performance at least on Cortex-A8. 41 * practice it hurts performance at least on Cortex-A8.
42 */ 42 */
43#if 1 43#if 1
44 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); 44 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
45#else 45#else
46 return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n); 46 return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
47#endif 47#endif
48} 48}
49 49
50static inline uint32x4_t 50static inline uint32x4_t
51vhtole_u32(uint32x4_t x) 51vhtole_u32(uint32x4_t x)
52{ 52{
53#if _BYTE_ORDER == _LITTLE_ENDIAN 53#if _BYTE_ORDER == _LITTLE_ENDIAN
54 return x; 54 return x;
55#elif _BYTE_ORDER == _BIG_ENDIAN 55#elif _BYTE_ORDER == _BIG_ENDIAN
56 return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); 56 return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
57#endif 57#endif
58} 58}
59 59
60static inline uint32x4_t 60static inline uint32x4_t
61vletoh_u32(uint32x4_t x) 61vletoh_u32(uint32x4_t x)
62{ 62{
63#if _BYTE_ORDER == _LITTLE_ENDIAN 63#if _BYTE_ORDER == _LITTLE_ENDIAN
64 return x; 64 return x;
65#elif _BYTE_ORDER == _BIG_ENDIAN 65#elif _BYTE_ORDER == _BIG_ENDIAN
66 return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); 66 return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
67#endif 67#endif
68} 68}
69  69
70static inline uint32x4_t 70static inline uint32x4_t
71rol16(uint32x4_t x) 71rol16(uint32x4_t x)
72{ 72{
73 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); 73 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
74 74
75 y16 = vrev32q_u16(x16); 75 y16 = vrev32q_u16(x16);
76 76
77 return vreinterpretq_u32_u16(y16); 77 return vreinterpretq_u32_u16(y16);
78} 78}
79 79
80static inline uint32x4_t 80static inline uint32x4_t
81rol12(uint32x4_t x) 81rol12(uint32x4_t x)
82{ 82{
83 83
84 return vrolq_n_u32(x, 12); 84 return vrolq_n_u32(x, 12);
85} 85}
86 86
87static inline uint32x4_t 87static inline uint32x4_t
88rol8(uint32x4_t x) 88rol8(uint32x4_t x)
89{ 89{
90#if defined(__aarch64__) 90#if defined(__aarch64__)
91 static const uint8x16_t rol8_tab = { 91 static const uint8x16_t rol8_tab = {
92 3, 0, 1, 2, 7, 4, 5, 6, 92 3, 0, 1, 2, 7, 4, 5, 6,
93 11, 8, 9,10, 15,12,13,14, 93 11, 8, 9,10, 15,12,13,14,
94 }; 94 };
95 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); 95 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
96 96
97 y8 = vqtbl1q_u8(x8, rol8_tab); 97 y8 = vqtbl1q_u8(x8, rol8_tab);
98 98
99 return vreinterpretq_u32_u8(y8); 99 return vreinterpretq_u32_u8(y8);
100#elif 0 100#elif 0
101 /* 101 /*
102 * GCC does a lousy job with this, spilling two 64-bit vector 102 * GCC does a lousy job with this, spilling two 64-bit vector
103 * registers to the stack every time. There should be plenty 103 * registers to the stack every time. There should be plenty
104 * of vector registers free, requiring no spills at all, and 104 * of vector registers free, requiring no spills at all, and
105 * GCC should be able to hoist the load of rol8_tab out of any 105 * GCC should be able to hoist the load of rol8_tab out of any
106 * loops, but it doesn't and so attempting to use VTBL hurts 106 * loops, but it doesn't and so attempting to use VTBL hurts
107 * more than it helps. 107 * more than it helps.
108 */ 108 */
109 static const uint8x8_t rol8_tab = { 109 static const uint8x8_t rol8_tab = {
110 3, 0, 1, 2, 7, 4, 5, 6, 110 3, 0, 1, 2, 7, 4, 5, 6,
111 }; 111 };
112 112
113 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); 113 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
114 114
115 y64 = (uint64x2_t) { 115 y64 = (uint64x2_t) {
116 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), 116 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
117 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), 117 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
118 }; 118 };
119 119
120 return vreinterpretq_u32_u64(y64); 120 return vreinterpretq_u32_u64(y64);
121#else 121#else
122 return vrolq_n_u32(x, 8); 122 return vrolq_n_u32(x, 8);
123#endif 123#endif
124} 124}
125 125
126static inline uint32x4_t 126static inline uint32x4_t
127rol7(uint32x4_t x) 127rol7(uint32x4_t x)
128{ 128{
129 129
130 return vrolq_n_u32(x, 7); 130 return vrolq_n_u32(x, 7);
131} 131}
132  132
133static inline void 133static inline void
134chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, 134chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
135 unsigned nr) 135 unsigned nr)
136{ 136{
137 uint32x4_t r0, r1, r2, r3; 137 uint32x4_t r0, r1, r2, r3;
138 uint32x4_t c0, c1, c2, c3; 138 uint32x4_t c0, c1, c2, c3;
139 139
140 r0 = *p0; 140 r0 = *p0;
141 r1 = *p1; 141 r1 = *p1;
142 r2 = *p2; 142 r2 = *p2;
143 r3 = *p3; 143 r3 = *p3;
144 144
145 for (; nr > 0; nr -= 2) { 145 for (; nr > 0; nr -= 2) {
146 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); 146 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
147 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); 147 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
148 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); 148 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
149 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); 149 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
150 150
151 c0 = r0; 151 c0 = r0;
152 c1 = vextq_u32(r1, r1, 1); 152 c1 = vextq_u32(r1, r1, 1);
153 c2 = vextq_u32(r2, r2, 2); 153 c2 = vextq_u32(r2, r2, 2);
154 c3 = vextq_u32(r3, r3, 3); 154 c3 = vextq_u32(r3, r3, 3);
155 155
156 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); 156 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
157 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); 157 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
158 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); 158 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
159 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); 159 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
160 160
161 r0 = c0; 161 r0 = c0;
162 r1 = vextq_u32(c1, c1, 3); 162 r1 = vextq_u32(c1, c1, 3);
163 r2 = vextq_u32(c2, c2, 2); 163 r2 = vextq_u32(c2, c2, 2);
164 r3 = vextq_u32(c3, c3, 1); 164 r3 = vextq_u32(c3, c3, 1);
165 } 165 }
166 166
167 *p0 = r0; 167 *p0 = r0;
168 *p1 = r1; 168 *p1 = r1;
169 *p2 = r2; 169 *p2 = r2;
170 *p3 = r3; 170 *p3 = r3;
171} 171}
172  172
173void 173void
174chacha_core_neon(uint8_t out[restrict static 64], 174chacha_core_neon(uint8_t out[restrict static 64],
175 const uint8_t in[static 16], 175 const uint8_t in[static 16],
176 const uint8_t k[static 32], 176 const uint8_t k[static 32],
177 const uint8_t c[static 16], 177 const uint8_t c[static 16],
178 unsigned nr) 178 unsigned nr)
179{ 179{
180 uint32x4_t in0, in1, in2, in3; 180 uint32x4_t in0, in1, in2, in3;
181 uint32x4_t r0, r1, r2, r3; 181 uint32x4_t r0, r1, r2, r3;
182 182
183 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 183 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
184 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 184 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
185 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 185 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
186 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 186 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
187 187
188 chacha_permute(&r0, &r1, &r2, &r3, nr); 188 chacha_permute(&r0, &r1, &r2, &r3, nr);
189 189
190 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); 190 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
191 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); 191 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
192 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); 192 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
193 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); 193 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
194} 194}
195 195
196void 196void
197hchacha_neon(uint8_t out[restrict static 32], 197hchacha_neon(uint8_t out[restrict static 32],
198 const uint8_t in[static 16], 198 const uint8_t in[static 16],
199 const uint8_t k[static 32], 199 const uint8_t k[static 32],
200 const uint8_t c[static 16], 200 const uint8_t c[static 16],
201 unsigned nr) 201 unsigned nr)
202{ 202{
203 uint32x4_t r0, r1, r2, r3; 203 uint32x4_t r0, r1, r2, r3;
204 204
205 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 205 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
206 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 206 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
207 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 207 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
208 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 208 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
209 209
210 chacha_permute(&r0, &r1, &r2, &r3, nr); 210 chacha_permute(&r0, &r1, &r2, &r3, nr);
211 211
212 vst1q_u32((uint32_t *)out + 0, r0); 212 vst1q_u32((uint32_t *)out + 0, r0);
213 vst1q_u32((uint32_t *)out + 4, r3); 213 vst1q_u32((uint32_t *)out + 4, r3);
214} 214}
215  215
216void 216void
217chacha_stream_neon(uint8_t *restrict s, size_t n, 217chacha_stream_neon(uint8_t *restrict s, size_t n,
218 uint32_t blkno, 218 uint32_t blkno,
219 const uint8_t nonce[static 12], 219 const uint8_t nonce[static 12],
220 const uint8_t k[static 32], 220 const uint8_t k[static 32],
221 unsigned nr) 221 unsigned nr)
222{ 222{
223 223
224#ifdef __aarch64__ 
225 for (; n >= 256; s += 256, n -= 256, blkno += 4) 224 for (; n >= 256; s += 256, n -= 256, blkno += 4)
226 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); 225 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
227#endif 
228 226
229 if (n) { 227 if (n) {
230 const uint32x4_t blkno_inc = {1,0,0,0}; 228 const uint32x4_t blkno_inc = {1,0,0,0};
231 uint32x4_t in0, in1, in2, in3; 229 uint32x4_t in0, in1, in2, in3;
232 uint32x4_t r0, r1, r2, r3; 230 uint32x4_t r0, r1, r2, r3;
233 231
234 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 232 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
235 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 233 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
236 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 234 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
237 in3 = (uint32x4_t) { 235 in3 = (uint32x4_t) {
238 blkno, 236 blkno,
239 le32dec(nonce), 237 le32dec(nonce),
240 le32dec(nonce + 4), 238 le32dec(nonce + 4),
241 le32dec(nonce + 8) 239 le32dec(nonce + 8)
242 }; 240 };
243 241
244 for (; n; s += 64, n -= 64) { 242 for (; n; s += 64, n -= 64) {
245 r0 = in0; 243 r0 = in0;
246 r1 = in1; 244 r1 = in1;
247 r2 = in2; 245 r2 = in2;
248 r3 = in3; 246 r3 = in3;
249 chacha_permute(&r0, &r1, &r2, &r3, nr); 247 chacha_permute(&r0, &r1, &r2, &r3, nr);
250 r0 = vhtole_u32(vaddq_u32(r0, in0)); 248 r0 = vhtole_u32(vaddq_u32(r0, in0));
251 r1 = vhtole_u32(vaddq_u32(r1, in1)); 249 r1 = vhtole_u32(vaddq_u32(r1, in1));
252 r2 = vhtole_u32(vaddq_u32(r2, in2)); 250 r2 = vhtole_u32(vaddq_u32(r2, in2));
253 r3 = vhtole_u32(vaddq_u32(r3, in3)); 251 r3 = vhtole_u32(vaddq_u32(r3, in3));
254 252
255 if (n < 64) { 253 if (n < 64) {
256 uint8_t buf[64] __aligned(16); 254 uint8_t buf[64] __aligned(16);
257 255
258 vst1q_u32((uint32_t *)buf + 4*0, r0); 256 vst1q_u32((uint32_t *)buf + 4*0, r0);
259 vst1q_u32((uint32_t *)buf + 4*1, r1); 257 vst1q_u32((uint32_t *)buf + 4*1, r1);
260 vst1q_u32((uint32_t *)buf + 4*2, r2); 258 vst1q_u32((uint32_t *)buf + 4*2, r2);
261 vst1q_u32((uint32_t *)buf + 4*3, r3); 259 vst1q_u32((uint32_t *)buf + 4*3, r3);
262 memcpy(s, buf, n); 260 memcpy(s, buf, n);
263 261
264 break; 262 break;
265 } 263 }
266 264
267 vst1q_u32((uint32_t *)s + 4*0, r0); 265 vst1q_u32((uint32_t *)s + 4*0, r0);
268 vst1q_u32((uint32_t *)s + 4*1, r1); 266 vst1q_u32((uint32_t *)s + 4*1, r1);
269 vst1q_u32((uint32_t *)s + 4*2, r2); 267 vst1q_u32((uint32_t *)s + 4*2, r2);
270 vst1q_u32((uint32_t *)s + 4*3, r3); 268 vst1q_u32((uint32_t *)s + 4*3, r3);
271 in3 = vaddq_u32(in3, blkno_inc); 269 in3 = vaddq_u32(in3, blkno_inc);
272 } 270 }
273 } 271 }
274} 272}
275  273
276void 274void
277chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, 275chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
278 uint32_t blkno, 276 uint32_t blkno,
279 const uint8_t nonce[static 12], 277 const uint8_t nonce[static 12],
280 const uint8_t k[static 32], 278 const uint8_t k[static 32],
281 unsigned nr) 279 unsigned nr)
282{ 280{
283 281
284#ifdef __aarch64__ 
285 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) 282 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
286 chacha_stream_xor256_neon(s, p, blkno, nonce, k, 283 chacha_stream_xor256_neon(s, p, blkno, nonce, k,
287 chacha_const32, nr); 284 chacha_const32, nr);
288#endif 
289 285
290 if (n) { 286 if (n) {
291 const uint32x4_t blkno_inc = {1,0,0,0}; 287 const uint32x4_t blkno_inc = {1,0,0,0};
292 uint32x4_t in0, in1, in2, in3; 288 uint32x4_t in0, in1, in2, in3;
293 uint32x4_t r0, r1, r2, r3; 289 uint32x4_t r0, r1, r2, r3;
294 290
295 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 291 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
296 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 292 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
297 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 293 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
298 in3 = (uint32x4_t) { 294 in3 = (uint32x4_t) {
299 blkno, 295 blkno,
300 le32dec(nonce), 296 le32dec(nonce),
301 le32dec(nonce + 4), 297 le32dec(nonce + 4),
302 le32dec(nonce + 8) 298 le32dec(nonce + 8)
303 }; 299 };
304 300
305 for (; n; s += 64, p += 64, n -= 64) { 301 for (; n; s += 64, p += 64, n -= 64) {
306 r0 = in0; 302 r0 = in0;
307 r1 = in1; 303 r1 = in1;
308 r2 = in2; 304 r2 = in2;
309 r3 = in3; 305 r3 = in3;
310 chacha_permute(&r0, &r1, &r2, &r3, nr); 306 chacha_permute(&r0, &r1, &r2, &r3, nr);
311 r0 = vhtole_u32(vaddq_u32(r0, in0)); 307 r0 = vhtole_u32(vaddq_u32(r0, in0));
312 r1 = vhtole_u32(vaddq_u32(r1, in1)); 308 r1 = vhtole_u32(vaddq_u32(r1, in1));
313 r2 = vhtole_u32(vaddq_u32(r2, in2)); 309 r2 = vhtole_u32(vaddq_u32(r2, in2));
314 r3 = vhtole_u32(vaddq_u32(r3, in3)); 310 r3 = vhtole_u32(vaddq_u32(r3, in3));
315 311
316 if (n < 64) { 312 if (n < 64) {
317 uint8_t buf[64] __aligned(16); 313 uint8_t buf[64] __aligned(16);
318 unsigned i; 314 unsigned i;
319 315
320 vst1q_u32((uint32_t *)buf + 4*0, r0); 316 vst1q_u32((uint32_t *)buf + 4*0, r0);
321 vst1q_u32((uint32_t *)buf + 4*1, r1); 317 vst1q_u32((uint32_t *)buf + 4*1, r1);
322 vst1q_u32((uint32_t *)buf + 4*2, r2); 318 vst1q_u32((uint32_t *)buf + 4*2, r2);
323 vst1q_u32((uint32_t *)buf + 4*3, r3); 319 vst1q_u32((uint32_t *)buf + 4*3, r3);
324 320
325 for (i = 0; i < n - n%4; i += 4) 321 for (i = 0; i < n - n%4; i += 4)
326 le32enc(s + i, 322 le32enc(s + i,
327 le32dec(p + i) ^ le32dec(buf + i)); 323 le32dec(p + i) ^ le32dec(buf + i));
328 for (; i < n; i++) 324 for (; i < n; i++)
329 s[i] = p[i] ^ buf[i]; 325 s[i] = p[i] ^ buf[i];
330 326
331 break; 327 break;
332 } 328 }
333 329
334 r0 ^= vld1q_u32((const uint32_t *)p + 4*0); 330 r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
335 r1 ^= vld1q_u32((const uint32_t *)p + 4*1); 331 r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
336 r2 ^= vld1q_u32((const uint32_t *)p + 4*2); 332 r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
337 r3 ^= vld1q_u32((const uint32_t *)p + 4*3); 333 r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
338 vst1q_u32((uint32_t *)s + 4*0, r0); 334 vst1q_u32((uint32_t *)s + 4*0, r0);
339 vst1q_u32((uint32_t *)s + 4*1, r1); 335 vst1q_u32((uint32_t *)s + 4*1, r1);
340 vst1q_u32((uint32_t *)s + 4*2, r2); 336 vst1q_u32((uint32_t *)s + 4*2, r2);
341 vst1q_u32((uint32_t *)s + 4*3, r3); 337 vst1q_u32((uint32_t *)s + 4*3, r3);
342 in3 = vaddq_u32(in3, blkno_inc); 338 in3 = vaddq_u32(in3, blkno_inc);
343 } 339 }
344 } 340 }
345} 341}
346  342
347void 343void
348xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, 344xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
349 uint32_t blkno, 345 uint32_t blkno,
350 const uint8_t nonce[static 24], 346 const uint8_t nonce[static 24],
351 const uint8_t k[static 32], 347 const uint8_t k[static 32],
352 unsigned nr) 348 unsigned nr)
353{ 349{
354 uint8_t subkey[32]; 350 uint8_t subkey[32];
355 uint8_t subnonce[12]; 351 uint8_t subnonce[12];
356 352
357 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 353 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
358 memset(subnonce, 0, 4); 354 memset(subnonce, 0, 4);
359 memcpy(subnonce + 4, nonce + 16, 8); 355 memcpy(subnonce + 4, nonce + 16, 8);
360 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); 356 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
361} 357}
362 358
363void 359void
364xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 360xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
365 uint32_t blkno, 361 uint32_t blkno,
366 const uint8_t nonce[static 24], 362 const uint8_t nonce[static 24],
367 const uint8_t k[static 32], 363 const uint8_t k[static 32],
368 unsigned nr) 364 unsigned nr)
369{ 365{
370 uint8_t subkey[32]; 366 uint8_t subkey[32];
371 uint8_t subnonce[12]; 367 uint8_t subnonce[12];
372 368
373 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 369 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
374 memset(subnonce, 0, 4); 370 memset(subnonce, 0, 4);
375 memcpy(subnonce + 4, nonce + 16, 8); 371 memcpy(subnonce + 4, nonce + 16, 8);
376 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); 372 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
377} 373}

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h

--- src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/28 20:08:48 1.3
@@ -1,85 +1,83 @@
1/* $NetBSD: chacha_neon.h,v 1.2 2020/07/27 20:51:29 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.h,v 1.3 2020/07/28 20:08:48 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H 29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H
30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H 30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H
31 31
32#include <sys/types.h> 32#include <sys/types.h>
33 33
34#include <crypto/chacha/chacha_impl.h> 34#include <crypto/chacha/chacha_impl.h>
35 35
36void chacha_core_neon(uint8_t[restrict static 64], 36void chacha_core_neon(uint8_t[restrict static 64],
37 const uint8_t[static 16], 37 const uint8_t[static 16],
38 const uint8_t[static 32], 38 const uint8_t[static 32],
39 const uint8_t[static 16], 39 const uint8_t[static 16],
40 unsigned); 40 unsigned);
41void hchacha_neon(uint8_t[restrict static 32], 41void hchacha_neon(uint8_t[restrict static 32],
42 const uint8_t[static 16], 42 const uint8_t[static 16],
43 const uint8_t[static 32], 43 const uint8_t[static 32],
44 const uint8_t[static 16], 44 const uint8_t[static 16],
45 unsigned); 45 unsigned);
46void chacha_stream_neon(uint8_t *restrict, size_t, 46void chacha_stream_neon(uint8_t *restrict, size_t,
47 uint32_t, 47 uint32_t,
48 const uint8_t[static 12], 48 const uint8_t[static 12],
49 const uint8_t[static 32], 49 const uint8_t[static 32],
50 unsigned); 50 unsigned);
51void chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, 51void chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t,
52 uint32_t, 52 uint32_t,
53 const uint8_t[static 12], 53 const uint8_t[static 12],
54 const uint8_t[static 32], 54 const uint8_t[static 32],
55 unsigned); 55 unsigned);
56void xchacha_stream_neon(uint8_t *restrict, size_t, 56void xchacha_stream_neon(uint8_t *restrict, size_t,
57 uint32_t, 57 uint32_t,
58 const uint8_t[static 24], 58 const uint8_t[static 24],
59 const uint8_t[static 32], 59 const uint8_t[static 32],
60 unsigned); 60 unsigned);
61void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, 61void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t,
62 uint32_t, 62 uint32_t,
63 const uint8_t[static 24], 63 const uint8_t[static 24],
64 const uint8_t[static 32], 64 const uint8_t[static 32],
65 unsigned); 65 unsigned);
66 66
67#ifdef __aarch64__ 67/* Assembly helpers */
68/* Assembly helpers -- aarch64 only for now */ 
69void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, 68void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t,
70 const uint8_t[static 12], 69 const uint8_t[static 12],
71 const uint8_t[static 32], 70 const uint8_t[static 32],
72 const uint8_t[static 16], 71 const uint8_t[static 16],
73 unsigned); 72 unsigned);
74void chacha_stream_xor256_neon(uint8_t[restrict static 256], 73void chacha_stream_xor256_neon(uint8_t[restrict static 256],
75 const uint8_t[static 256], 74 const uint8_t[static 256],
76 uint32_t, 75 uint32_t,
77 const uint8_t[static 12], 76 const uint8_t[static 12],
78 const uint8_t[static 32], 77 const uint8_t[static 32],
79 const uint8_t[static 16], 78 const uint8_t[static 16],
80 unsigned); 79 unsigned);
81#endif /* __aarch64__ */ 
82 80
83extern const struct chacha_impl chacha_neon_impl; 81extern const struct chacha_impl chacha_neon_impl;
84 82
85#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */ 83#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm

--- src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/28 20:08:48 1.3
@@ -1,11 +1,12 @@
1# $NetBSD: files.chacha_arm,v 1.2 2020/07/27 20:51:29 riastradh Exp $ 1# $NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $
2 2
3ifdef aarch64 3ifdef aarch64
4makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" 4makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a"
5else 5else
6makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" 6makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon"
7endif 7endif
8 8
9file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64) 9file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64)
 10file crypto/chacha/arch/arm/chacha_neon_32.S chacha & cpu_cortex & !aarch64
10file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 11file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64
11file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64) 12file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64)

File Added: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")

	.fpu	neon

/*
 * ChaCha round, split up so we can interleave the quarterrounds on
 * independent rows/diagonals to maximize pipeline efficiency, with
 * spills to deal with the scarcity of registers.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		VREV32.16 (swap the halfwords in each word)
 *	<<< 12		VSHL/VSRI (shift left, then shift right and insert)
 *	<<< 8		VTBL (general byte permutation; the rot8 table
 *			below, addressed via r7)
 *	<<< 7		VSHL/VSRI
 */
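
/*
 * Register convention for the code below: the vectorization is across
 * blocks, not within one block.  Register qN holds word N of four
 * independent 16-word ChaCha blocks, one block per 32-bit lane, so a
 * single ROUND performs a full column (or diagonal) round -- four
 * quarterrounds -- on all four blocks at once.  Called with
 * (a,b,c,d) = (q0-q3, q4-q7, q8-q11, q12-q15) it is the column round;
 * called with the b, c, and d register lists rotated by one, two, and
 * three positions it is the diagonal round.  ROUND runs out of
 * registers and leaves its last two c rows spilled to the 32-byte
 * scratch area at fp; ROUNDLD reloads them before the next round.
 */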

.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
	vld1.32		{\c2-\c3}, [fp, :256]
.endm

.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
	/* a += b; d ^= a; d <<<= 16 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor		\d0, \d0, \a0
	veor		\d1, \d1, \a1
	veor		\d2, \d2, \a2
	veor		\d3, \d3, \a3

	vrev32.16	\d0, \d0
	vrev32.16	\d1, \d1
	vrev32.16	\d2, \d2
	vrev32.16	\d3, \d3

	/* c += d; b ^= c; b <<<= 12 */
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3

	vst1.32		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */

	veor		\c0, \b0, \c0
	veor		\c1, \b1, \c1
	vshl.u32	\b0, \c0, #12
	vshl.u32	\b1, \c1, #12
	vsri.u32	\b0, \c0, #(32 - 12)
	vsri.u32	\b1, \c1, #(32 - 12)

	veor		\c0, \b2, \c2
	veor		\c1, \b3, \c3
	vshl.u32	\b2, \c0, #12
	vshl.u32	\b3, \c1, #12
	vsri.u32	\b2, \c0, #(32 - 12)
	vsri.u32	\b3, \c1, #(32 - 12)

	vld1.8		{\c0l}, [r7, :64]	/* load rot8 table */

	/* a += b; d ^= a; d <<<= 8 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor		\d0, \d0, \a0
	veor		\d1, \d1, \a1
	veor		\d2, \d2, \a2
	veor		\d3, \d3, \a3

	vtbl.8		\d0l, {\d0l}, \c0l	/* <<< 8 */
	vtbl.8		\d0h, {\d0h}, \c0l
	vtbl.8		\d1l, {\d1l}, \c0l
	vtbl.8		\d1h, {\d1h}, \c0l
	vtbl.8		\d2l, {\d2l}, \c0l
	vtbl.8		\d2h, {\d2h}, \c0l
	vtbl.8		\d3l, {\d3l}, \c0l
	vtbl.8		\d3h, {\d3h}, \c0l

	vld1.32		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */

	/* c += d; b ^= c; b <<<= 7 */
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1

	vst1.32		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */

	veor		\c2, \b2, \c2
	veor		\c3, \b3, \c3
	vshl.u32	\b2, \c2, #7
	vshl.u32	\b3, \c3, #7
	vsri.u32	\b2, \c2, #(32 - 7)
	vsri.u32	\b3, \c3, #(32 - 7)

	veor		\c2, \b0, \c0
	veor		\c3, \b1, \c1
	vshl.u32	\b0, \c2, #7
	vshl.u32	\b1, \c3, #7
	vsri.u32	\b0, \c2, #(32 - 7)
	vsri.u32	\b1, \c3, #(32 - 7)
.endm

#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	vrev32.8	x, x
#define	LE32TOH(x)	vrev32.8	x, x
#endif

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

/*
 * chacha_stream256_neon(uint8_t s[256]@r0,
 *     uint32_t blkno@r1,
 *     const uint8_t nonce[12]@r2,
 *     const uint8_t key[32]@r3,
 *     const uint8_t const[16]@sp[0],
 *     unsigned nr@sp[4])
 */
ENTRY(chacha_stream256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* reserve space for two 128-bit/16-byte q registers */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r4]	/* q12 := constant */
	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */

	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r1		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	r5, r5, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  We have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10
	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r4]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q3, q3, q9

	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q6, q6, q8
	vadd.u32 q7, q7, q8

	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q3)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q6)
	LE32TOH(q7)

	vst1.32	{q0-q1}, [r0]!
	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vadd.u32 q8, q8, q0
	vadd.u32 q9, q9, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q11, q11, q0

	vadd.u32 q12, q12, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q14, q14, q1
	vadd.u32 q15, q15, q1

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q11)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q14)
	LE32TOH(q15)

	/* prepare to zero temporary space on stack */
	vmov.i32 q0, #0
	vmov.i32 q1, #0

	/* vst1.32	{q0}, [r0]! */
	/* vst1.32	{q1}, [r0]! */	/* (was q4 before vswp) */
	vst1.32	{q8}, [r0]!
	vst1.32	{q12}, [r0]!
	vst1.32	{q2}, [r0]!
	vst1.32	{q6}, [r0]!
	vst1.32	{q10}, [r0]!
	vst1.32	{q14}, [r0]!
	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
	vst1.32	{q5}, [r0]!
	vst1.32	{q9}, [r0]!
	vst1.32	{q13}, [r0]!
	vst1.32 {q3}, [r0]!
	vst1.32 {q7}, [r0]!
	vst1.32 {q11}, [r0]!
	vst1.32 {q15}, [r0]

	/* zero temporary space on the stack */
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
 *     uint32_t blkno@r2,
 *     const uint8_t nonce[12]@r3,
 *     const uint8_t key[32]@sp[0],
 *     const uint8_t const[16]@sp[4],
 *     unsigned nr@sp[8])
 */
ENTRY(chacha_stream_xor256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* reserve space for two 128-bit/16-byte q registers */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters */
	add	ip, sp, #96
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	vld1.32	{q12}, [r5]	/* q12 := constant */
	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */

	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r2		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	b	2f

	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	ip, ip, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  Now for the real fun:
	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
	 * transposed from one another, and the x[i] are in general
	 * registers and memory.  So we have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
	 */

	sub	r7, r7, #0x10
	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r5]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q3, q3, q9

	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q6, q6, q8
	vadd.u32 q7, q7, q8

	vld1.32 {q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q6)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q3)
	LE32TOH(q7)

	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
	veor	q1, q1, q9

	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */

	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vswp	q9, q12		/* free up q9 earlier for consecutive q8-q9 */

	vadd.u32 q8, q8, q0
	vadd.u32 q12, q12, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q11, q11, q0

	vadd.u32 q9, q9, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q14, q14, q1
	vadd.u32 q15, q15, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q14)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q11)
	LE32TOH(q15)

	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
	veor	q6, q6, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
	vst1.32	{q2}, [r0]!	/* store ciphertext bytes [64:80) */

	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
	veor	q14, q14, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
	vst1.32	{q6}, [r0]!	/* store ciphertext bytes [80:96) */

	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
	veor	q5, q5, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
	vst1.32	{q10}, [r0]!	/* store ciphertext bytes [96:112) */

	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
	veor	q13, q13, q1

	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
	vst1.32	{q14}, [r0]!	/* store ciphertext bytes [112:128) */

	veor	q8, q3, q8	/* compute ciphertext bytes [192:224) */
	veor	q9, q7, q9

	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */

	veor	q0, q11, q0	/* compute ciphertext bytes [224:256) */
	veor	q1, q15, q1

	vst1.32	{q8-q9}, [r0]!	/* store ciphertext bytes [192:224) */
	vst1.32	{q0-q1}, [r0]	/* store ciphertext bytes [224:256) */

	/* zero temporary space on the stack */
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4
.Lconstants:

	.type	v0123,%object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	.type	rot8,%object
rot8:
	.long	0x02010003, 0x06050407
END(rot8)
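
The rot8 table just above is the byte permutation used by the
vtbl.8-based <<< 8 rotation; it encodes the same permutation as
rol8_tab in chacha_neon.c.  A small host-side check -- an editorial
sketch, not part of the commit, assuming a little-endian host -- showing
that .long 0x02010003, 0x06050407 is the byte sequence {3,0,1,2, 7,4,5,6}
and that indexing each 4-byte word through it (which is what vtbl.8 does,
one d register at a time) rotates the word left by 8 bits:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	/* bytes of .long 0x02010003, 0x06050407 in little-endian order */
	static const uint8_t rot8[8] = { 3,0,1,2, 7,4,5,6 };
	uint32_t x[2] = { 0x12345678, 0x9abcdef0 }, y[2];
	uint8_t in[8], out[8];
	unsigned i;

	memcpy(in, x, 8);		/* view the two words as bytes */
	for (i = 0; i < 8; i++)		/* out[i] = in[rot8[i]], as vtbl.8 */
		out[i] = in[rot8[i]];
	memcpy(y, out, 8);

	assert(y[0] == 0x34567812);	/* 0x12345678 <<< 8 */
	assert(y[1] == 0xbcdef09a);	/* 0x9abcdef0 <<< 8 */
	return 0;
}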

cvs diff -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile

--- src/tests/sys/crypto/chacha/Makefile 2020/07/27 20:51:29 1.2
+++ src/tests/sys/crypto/chacha/Makefile 2020/07/28 20:08:48 1.3
@@ -1,45 +1,47 @@
1# $NetBSD: Makefile,v 1.2 2020/07/27 20:51:29 riastradh Exp $ 1# $NetBSD: Makefile,v 1.3 2020/07/28 20:08:48 riastradh Exp $
2 2
3.include <bsd.own.mk> 3.include <bsd.own.mk>
4 4
5TESTSDIR= ${TESTSBASE}/sys/crypto/chacha 5TESTSDIR= ${TESTSBASE}/sys/crypto/chacha
6 6
7TESTS_C= t_chacha 7TESTS_C= t_chacha
8 8
9AFLAGS+= -D_LOCORE 9AFLAGS+= -D_LOCORE
10 10
11.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha 11.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha
12CPPFLAGS+= -I${NETBSDSRCDIR}/sys 12CPPFLAGS+= -I${NETBSDSRCDIR}/sys
13 13
14SRCS.t_chacha+= t_chacha.c 14SRCS.t_chacha+= t_chacha.c
15 15
16SRCS.t_chacha+= chacha_ref.c 16SRCS.t_chacha+= chacha_ref.c
17SRCS.t_chacha+= chacha_selftest.c 17SRCS.t_chacha+= chacha_selftest.c
18 18
19.if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) 19.if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*)
20 20
21.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm 21.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm
22CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm 22CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm
23 23
24SRCS.t_chacha+= chacha_neon.c 24SRCS.t_chacha+= chacha_neon.c
25.if !empty(MACHINE_ARCH:Maarch64*) 25.if !empty(MACHINE_ARCH:Mearmv7*)
 26SRCS.t_chacha+= chacha_neon_32.S
 27.elif !empty(MACHINE_ARCH:Maarch64*)
26SRCS.t_chacha+= chacha_neon_64.S 28SRCS.t_chacha+= chacha_neon_64.S
27.endif 29.endif
28SRCS.t_chacha+= chacha_neon_impl.c 30SRCS.t_chacha+= chacha_neon_impl.c
29 31
30.endif # earmv7 or aarch64 32.endif # earmv7 or aarch64
31 33
32.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64" 34.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64"
33 35
34.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 36.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86
35CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 37CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86
36 38
37SRCS.t_chacha+= chacha_sse2.c 39SRCS.t_chacha+= chacha_sse2.c
38SRCS.t_chacha+= chacha_sse2_impl.c 40SRCS.t_chacha+= chacha_sse2_impl.c
39COPTS.chacha_sse2.c+= -msse -msse2 41COPTS.chacha_sse2.c+= -msse -msse2
40 42
41.endif # x86 43.endif # x86
42 44
43WARNS= 5 45WARNS= 5
44 46
45.include <bsd.test.mk> 47.include <bsd.test.mk>