Tue Jul 28 20:08:48 2020 UTC
Implement 4-way vectorization of ChaCha for armv7 NEON.
cgd performance is not as good as I was hoping (~4% improvement over
chacha_ref.c), but it should improve substantially more once we let the
cgd worker thread keep FPU state, so we don't have to pay the cost of
an isb and zeroing the FPU on every 512-byte cgd block.
(riastradh)
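
For reference, the quarterround that both the intrinsics path and the
new armv7 assembly implement is the standard ChaCha one (the rol16,
rol12, rol8, rol7 steps in chacha_permute below); a minimal scalar
sketch, with illustrative names, not part of the commit:

	#include <stdint.h>

	static inline uint32_t
	rotl32(uint32_t x, int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* One ChaCha quarterround on four 32-bit state words. */
	static void
	quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = rotl32(*d, 16);
		*c += *d; *b ^= *c; *b = rotl32(*b, 12);
		*a += *b; *d ^= *a; *d = rotl32(*d, 8);
		*c += *d; *b ^= *c; *b = rotl32(*b, 7);
	}

The 4-way vectorization keeps word i of four independent 64-byte
blocks in the four lanes of one 128-bit q register, so each
vadd.u32/veor/rotate step performs a quarterround step on four blocks
at once.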
diff -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm
diff -r0 -r1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile
--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:05:33 1.6
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:08:48 1.7
| @@ -1,377 +1,373 @@ | | | @@ -1,377 +1,373 @@ |
1 | /* $NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $ */ | | 1 | /* $NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
15 | * | | 15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #include <sys/types.h> | | 29 | #include <sys/types.h> |
30 | #include <sys/endian.h> | | 30 | #include <sys/endian.h> |
31 | | | 31 | |
32 | #include "arm_neon.h" | | 32 | #include "arm_neon.h" |
33 | #include "chacha_neon.h" | | 33 | #include "chacha_neon.h" |
34 | | | 34 | |
35 | static inline uint32x4_t | | 35 | static inline uint32x4_t |
36 | vrolq_n_u32(uint32x4_t x, uint8_t n) | | 36 | vrolq_n_u32(uint32x4_t x, uint8_t n) |
37 | { | | 37 | { |
38 | | | 38 | |
39 | /* | | 39 | /* |
40 | * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in | | 40 | * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in |
41 | * practice it hurts performance at least on Cortex-A8. | | 41 | * practice it hurts performance at least on Cortex-A8. |
42 | */ | | 42 | */ |
43 | #if 1 | | 43 | #if 1 |
44 | return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); | | 44 | return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); |
45 | #else | | 45 | #else |
46 | return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n); | | 46 | return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n); |
47 | #endif | | 47 | #endif |
48 | } | | 48 | } |
49 | | | 49 | |
50 | static inline uint32x4_t | | 50 | static inline uint32x4_t |
51 | vhtole_u32(uint32x4_t x) | | 51 | vhtole_u32(uint32x4_t x) |
52 | { | | 52 | { |
53 | #if _BYTE_ORDER == _LITTLE_ENDIAN | | 53 | #if _BYTE_ORDER == _LITTLE_ENDIAN |
54 | return x; | | 54 | return x; |
55 | #elif _BYTE_ORDER == _BIG_ENDIAN | | 55 | #elif _BYTE_ORDER == _BIG_ENDIAN |
56 | return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); | | 56 | return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); |
57 | #endif | | 57 | #endif |
58 | } | | 58 | } |
59 | | | 59 | |
60 | static inline uint32x4_t | | 60 | static inline uint32x4_t |
61 | vletoh_u32(uint32x4_t x) | | 61 | vletoh_u32(uint32x4_t x) |
62 | { | | 62 | { |
63 | #if _BYTE_ORDER == _LITTLE_ENDIAN | | 63 | #if _BYTE_ORDER == _LITTLE_ENDIAN |
64 | return x; | | 64 | return x; |
65 | #elif _BYTE_ORDER == _BIG_ENDIAN | | 65 | #elif _BYTE_ORDER == _BIG_ENDIAN |
66 | return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); | | 66 | return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))); |
67 | #endif | | 67 | #endif |
68 | } | | 68 | } |
69 | | | 69 | |
70 | static inline uint32x4_t | | 70 | static inline uint32x4_t |
71 | rol16(uint32x4_t x) | | 71 | rol16(uint32x4_t x) |
72 | { | | 72 | { |
73 | uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); | | 73 | uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); |
74 | | | 74 | |
75 | y16 = vrev32q_u16(x16); | | 75 | y16 = vrev32q_u16(x16); |
76 | | | 76 | |
77 | return vreinterpretq_u32_u16(y16); | | 77 | return vreinterpretq_u32_u16(y16); |
78 | } | | 78 | } |
79 | | | 79 | |
80 | static inline uint32x4_t | | 80 | static inline uint32x4_t |
81 | rol12(uint32x4_t x) | | 81 | rol12(uint32x4_t x) |
82 | { | | 82 | { |
83 | | | 83 | |
84 | return vrolq_n_u32(x, 12); | | 84 | return vrolq_n_u32(x, 12); |
85 | } | | 85 | } |
86 | | | 86 | |
87 | static inline uint32x4_t | | 87 | static inline uint32x4_t |
88 | rol8(uint32x4_t x) | | 88 | rol8(uint32x4_t x) |
89 | { | | 89 | { |
90 | #if defined(__aarch64__) | | 90 | #if defined(__aarch64__) |
91 | static const uint8x16_t rol8_tab = { | | 91 | static const uint8x16_t rol8_tab = { |
92 | 3, 0, 1, 2, 7, 4, 5, 6, | | 92 | 3, 0, 1, 2, 7, 4, 5, 6, |
93 | 11, 8, 9,10, 15,12,13,14, | | 93 | 11, 8, 9,10, 15,12,13,14, |
94 | }; | | 94 | }; |
95 | uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); | | 95 | uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); |
96 | | | 96 | |
97 | y8 = vqtbl1q_u8(x8, rol8_tab); | | 97 | y8 = vqtbl1q_u8(x8, rol8_tab); |
98 | | | 98 | |
99 | return vreinterpretq_u32_u8(y8); | | 99 | return vreinterpretq_u32_u8(y8); |
100 | #elif 0 | | 100 | #elif 0 |
101 | /* | | 101 | /* |
102 | * GCC does a lousy job with this, spilling two 64-bit vector | | 102 | * GCC does a lousy job with this, spilling two 64-bit vector |
103 | * registers to the stack every time. There should be plenty | | 103 | * registers to the stack every time. There should be plenty |
104 | * of vector registers free, requiring no spills at all, and | | 104 | * of vector registers free, requiring no spills at all, and |
105 | * GCC should be able to hoist the load of rol8_tab out of any | | 105 | * GCC should be able to hoist the load of rol8_tab out of any |
106 | * loops, but it doesn't and so attempting to use VTBL hurts | | 106 | * loops, but it doesn't and so attempting to use VTBL hurts |
107 | * more than it helps. | | 107 | * more than it helps. |
108 | */ | | 108 | */ |
109 | static const uint8x8_t rol8_tab = { | | 109 | static const uint8x8_t rol8_tab = { |
110 | 3, 0, 1, 2, 7, 4, 5, 6, | | 110 | 3, 0, 1, 2, 7, 4, 5, 6, |
111 | }; | | 111 | }; |
112 | | | 112 | |
113 | uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); | | 113 | uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); |
114 | | | 114 | |
115 | y64 = (uint64x2_t) { | | 115 | y64 = (uint64x2_t) { |
116 | (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), | | 116 | (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), |
117 | (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), | | 117 | (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), |
118 | }; | | 118 | }; |
119 | | | 119 | |
120 | return vreinterpretq_u32_u64(y64); | | 120 | return vreinterpretq_u32_u64(y64); |
121 | #else | | 121 | #else |
122 | return vrolq_n_u32(x, 8); | | 122 | return vrolq_n_u32(x, 8); |
123 | #endif | | 123 | #endif |
124 | } | | 124 | } |
125 | | | 125 | |
126 | static inline uint32x4_t | | 126 | static inline uint32x4_t |
127 | rol7(uint32x4_t x) | | 127 | rol7(uint32x4_t x) |
128 | { | | 128 | { |
129 | | | 129 | |
130 | return vrolq_n_u32(x, 7); | | 130 | return vrolq_n_u32(x, 7); |
131 | } | | 131 | } |
132 | | | 132 | |
133 | static inline void | | 133 | static inline void |
134 | chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, | | 134 | chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, |
135 | unsigned nr) | | 135 | unsigned nr) |
136 | { | | 136 | { |
137 | uint32x4_t r0, r1, r2, r3; | | 137 | uint32x4_t r0, r1, r2, r3; |
138 | uint32x4_t c0, c1, c2, c3; | | 138 | uint32x4_t c0, c1, c2, c3; |
139 | | | 139 | |
140 | r0 = *p0; | | 140 | r0 = *p0; |
141 | r1 = *p1; | | 141 | r1 = *p1; |
142 | r2 = *p2; | | 142 | r2 = *p2; |
143 | r3 = *p3; | | 143 | r3 = *p3; |
144 | | | 144 | |
145 | for (; nr > 0; nr -= 2) { | | 145 | for (; nr > 0; nr -= 2) { |
146 | r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); | | 146 | r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); |
147 | r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); | | 147 | r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); |
148 | r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); | | 148 | r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); |
149 | r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); | | 149 | r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); |
150 | | | 150 | |
151 | c0 = r0; | | 151 | c0 = r0; |
152 | c1 = vextq_u32(r1, r1, 1); | | 152 | c1 = vextq_u32(r1, r1, 1); |
153 | c2 = vextq_u32(r2, r2, 2); | | 153 | c2 = vextq_u32(r2, r2, 2); |
154 | c3 = vextq_u32(r3, r3, 3); | | 154 | c3 = vextq_u32(r3, r3, 3); |
155 | | | 155 | |
156 | c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); | | 156 | c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); |
157 | c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); | | 157 | c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); |
158 | c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); | | 158 | c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); |
159 | c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); | | 159 | c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); |
160 | | | 160 | |
161 | r0 = c0; | | 161 | r0 = c0; |
162 | r1 = vextq_u32(c1, c1, 3); | | 162 | r1 = vextq_u32(c1, c1, 3); |
163 | r2 = vextq_u32(c2, c2, 2); | | 163 | r2 = vextq_u32(c2, c2, 2); |
164 | r3 = vextq_u32(c3, c3, 1); | | 164 | r3 = vextq_u32(c3, c3, 1); |
165 | } | | 165 | } |
166 | | | 166 | |
167 | *p0 = r0; | | 167 | *p0 = r0; |
168 | *p1 = r1; | | 168 | *p1 = r1; |
169 | *p2 = r2; | | 169 | *p2 = r2; |
170 | *p3 = r3; | | 170 | *p3 = r3; |
171 | } | | 171 | } |
172 | | | 172 | |
173 | void | | 173 | void |
174 | chacha_core_neon(uint8_t out[restrict static 64], | | 174 | chacha_core_neon(uint8_t out[restrict static 64], |
175 | const uint8_t in[static 16], | | 175 | const uint8_t in[static 16], |
176 | const uint8_t k[static 32], | | 176 | const uint8_t k[static 32], |
177 | const uint8_t c[static 16], | | 177 | const uint8_t c[static 16], |
178 | unsigned nr) | | 178 | unsigned nr) |
179 | { | | 179 | { |
180 | uint32x4_t in0, in1, in2, in3; | | 180 | uint32x4_t in0, in1, in2, in3; |
181 | uint32x4_t r0, r1, r2, r3; | | 181 | uint32x4_t r0, r1, r2, r3; |
182 | | | 182 | |
183 | r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); | | 183 | r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); |
184 | r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); | | 184 | r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); |
185 | r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); | | 185 | r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); |
186 | r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); | | 186 | r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); |
187 | | | 187 | |
188 | chacha_permute(&r0, &r1, &r2, &r3, nr); | | 188 | chacha_permute(&r0, &r1, &r2, &r3, nr); |
189 | | | 189 | |
190 | vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); | | 190 | vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); |
191 | vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); | | 191 | vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); |
192 | vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); | | 192 | vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); |
193 | vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); | | 193 | vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); |
194 | } | | 194 | } |
195 | | | 195 | |
196 | void | | 196 | void |
197 | hchacha_neon(uint8_t out[restrict static 32], | | 197 | hchacha_neon(uint8_t out[restrict static 32], |
198 | const uint8_t in[static 16], | | 198 | const uint8_t in[static 16], |
199 | const uint8_t k[static 32], | | 199 | const uint8_t k[static 32], |
200 | const uint8_t c[static 16], | | 200 | const uint8_t c[static 16], |
201 | unsigned nr) | | 201 | unsigned nr) |
202 | { | | 202 | { |
203 | uint32x4_t r0, r1, r2, r3; | | 203 | uint32x4_t r0, r1, r2, r3; |
204 | | | 204 | |
205 | r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); | | 205 | r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); |
206 | r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); | | 206 | r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); |
207 | r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); | | 207 | r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); |
208 | r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); | | 208 | r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); |
209 | | | 209 | |
210 | chacha_permute(&r0, &r1, &r2, &r3, nr); | | 210 | chacha_permute(&r0, &r1, &r2, &r3, nr); |
211 | | | 211 | |
212 | vst1q_u32((uint32_t *)out + 0, r0); | | 212 | vst1q_u32((uint32_t *)out + 0, r0); |
213 | vst1q_u32((uint32_t *)out + 4, r3); | | 213 | vst1q_u32((uint32_t *)out + 4, r3); |
214 | } | | 214 | } |
215 | | | 215 | |
216 | void | | 216 | void |
217 | chacha_stream_neon(uint8_t *restrict s, size_t n, | | 217 | chacha_stream_neon(uint8_t *restrict s, size_t n, |
218 | uint32_t blkno, | | 218 | uint32_t blkno, |
219 | const uint8_t nonce[static 12], | | 219 | const uint8_t nonce[static 12], |
220 | const uint8_t k[static 32], | | 220 | const uint8_t k[static 32], |
221 | unsigned nr) | | 221 | unsigned nr) |
222 | { | | 222 | { |
223 | | | 223 | |
224 | #ifdef __aarch64__ | | | |
225 | for (; n >= 256; s += 256, n -= 256, blkno += 4) | | 224 | for (; n >= 256; s += 256, n -= 256, blkno += 4) |
226 | chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); | | 225 | chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); |
227 | #endif | | | |
228 | | | 226 | |
229 | if (n) { | | 227 | if (n) { |
230 | const uint32x4_t blkno_inc = {1,0,0,0}; | | 228 | const uint32x4_t blkno_inc = {1,0,0,0}; |
231 | uint32x4_t in0, in1, in2, in3; | | 229 | uint32x4_t in0, in1, in2, in3; |
232 | uint32x4_t r0, r1, r2, r3; | | 230 | uint32x4_t r0, r1, r2, r3; |
233 | | | 231 | |
234 | in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); | | 232 | in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); |
235 | in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); | | 233 | in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); |
236 | in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); | | 234 | in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); |
237 | in3 = (uint32x4_t) { | | 235 | in3 = (uint32x4_t) { |
238 | blkno, | | 236 | blkno, |
239 | le32dec(nonce), | | 237 | le32dec(nonce), |
240 | le32dec(nonce + 4), | | 238 | le32dec(nonce + 4), |
241 | le32dec(nonce + 8) | | 239 | le32dec(nonce + 8) |
242 | }; | | 240 | }; |
243 | | | 241 | |
244 | for (; n; s += 64, n -= 64) { | | 242 | for (; n; s += 64, n -= 64) { |
245 | r0 = in0; | | 243 | r0 = in0; |
246 | r1 = in1; | | 244 | r1 = in1; |
247 | r2 = in2; | | 245 | r2 = in2; |
248 | r3 = in3; | | 246 | r3 = in3; |
249 | chacha_permute(&r0, &r1, &r2, &r3, nr); | | 247 | chacha_permute(&r0, &r1, &r2, &r3, nr); |
250 | r0 = vhtole_u32(vaddq_u32(r0, in0)); | | 248 | r0 = vhtole_u32(vaddq_u32(r0, in0)); |
251 | r1 = vhtole_u32(vaddq_u32(r1, in1)); | | 249 | r1 = vhtole_u32(vaddq_u32(r1, in1)); |
252 | r2 = vhtole_u32(vaddq_u32(r2, in2)); | | 250 | r2 = vhtole_u32(vaddq_u32(r2, in2)); |
253 | r3 = vhtole_u32(vaddq_u32(r3, in3)); | | 251 | r3 = vhtole_u32(vaddq_u32(r3, in3)); |
254 | | | 252 | |
255 | if (n < 64) { | | 253 | if (n < 64) { |
256 | uint8_t buf[64] __aligned(16); | | 254 | uint8_t buf[64] __aligned(16); |
257 | | | 255 | |
258 | vst1q_u32((uint32_t *)buf + 4*0, r0); | | 256 | vst1q_u32((uint32_t *)buf + 4*0, r0); |
259 | vst1q_u32((uint32_t *)buf + 4*1, r1); | | 257 | vst1q_u32((uint32_t *)buf + 4*1, r1); |
260 | vst1q_u32((uint32_t *)buf + 4*2, r2); | | 258 | vst1q_u32((uint32_t *)buf + 4*2, r2); |
261 | vst1q_u32((uint32_t *)buf + 4*3, r3); | | 259 | vst1q_u32((uint32_t *)buf + 4*3, r3); |
262 | memcpy(s, buf, n); | | 260 | memcpy(s, buf, n); |
263 | | | 261 | |
264 | break; | | 262 | break; |
265 | } | | 263 | } |
266 | | | 264 | |
267 | vst1q_u32((uint32_t *)s + 4*0, r0); | | 265 | vst1q_u32((uint32_t *)s + 4*0, r0); |
268 | vst1q_u32((uint32_t *)s + 4*1, r1); | | 266 | vst1q_u32((uint32_t *)s + 4*1, r1); |
269 | vst1q_u32((uint32_t *)s + 4*2, r2); | | 267 | vst1q_u32((uint32_t *)s + 4*2, r2); |
270 | vst1q_u32((uint32_t *)s + 4*3, r3); | | 268 | vst1q_u32((uint32_t *)s + 4*3, r3); |
271 | in3 = vaddq_u32(in3, blkno_inc); | | 269 | in3 = vaddq_u32(in3, blkno_inc); |
272 | } | | 270 | } |
273 | } | | 271 | } |
274 | } | | 272 | } |
275 | | | 273 | |
276 | void | | 274 | void |
277 | chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, | | 275 | chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, |
278 | uint32_t blkno, | | 276 | uint32_t blkno, |
279 | const uint8_t nonce[static 12], | | 277 | const uint8_t nonce[static 12], |
280 | const uint8_t k[static 32], | | 278 | const uint8_t k[static 32], |
281 | unsigned nr) | | 279 | unsigned nr) |
282 | { | | 280 | { |
283 | | | 281 | |
284 | #ifdef __aarch64__ | | | |
285 | for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) | | 282 | for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) |
286 | chacha_stream_xor256_neon(s, p, blkno, nonce, k, | | 283 | chacha_stream_xor256_neon(s, p, blkno, nonce, k, |
287 | chacha_const32, nr); | | 284 | chacha_const32, nr); |
288 | #endif | | | |
289 | | | 285 | |
290 | if (n) { | | 286 | if (n) { |
291 | const uint32x4_t blkno_inc = {1,0,0,0}; | | 287 | const uint32x4_t blkno_inc = {1,0,0,0}; |
292 | uint32x4_t in0, in1, in2, in3; | | 288 | uint32x4_t in0, in1, in2, in3; |
293 | uint32x4_t r0, r1, r2, r3; | | 289 | uint32x4_t r0, r1, r2, r3; |
294 | | | 290 | |
295 | in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); | | 291 | in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); |
296 | in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); | | 292 | in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); |
297 | in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); | | 293 | in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); |
298 | in3 = (uint32x4_t) { | | 294 | in3 = (uint32x4_t) { |
299 | blkno, | | 295 | blkno, |
300 | le32dec(nonce), | | 296 | le32dec(nonce), |
301 | le32dec(nonce + 4), | | 297 | le32dec(nonce + 4), |
302 | le32dec(nonce + 8) | | 298 | le32dec(nonce + 8) |
303 | }; | | 299 | }; |
304 | | | 300 | |
305 | for (; n; s += 64, p += 64, n -= 64) { | | 301 | for (; n; s += 64, p += 64, n -= 64) { |
306 | r0 = in0; | | 302 | r0 = in0; |
307 | r1 = in1; | | 303 | r1 = in1; |
308 | r2 = in2; | | 304 | r2 = in2; |
309 | r3 = in3; | | 305 | r3 = in3; |
310 | chacha_permute(&r0, &r1, &r2, &r3, nr); | | 306 | chacha_permute(&r0, &r1, &r2, &r3, nr); |
311 | r0 = vhtole_u32(vaddq_u32(r0, in0)); | | 307 | r0 = vhtole_u32(vaddq_u32(r0, in0)); |
312 | r1 = vhtole_u32(vaddq_u32(r1, in1)); | | 308 | r1 = vhtole_u32(vaddq_u32(r1, in1)); |
313 | r2 = vhtole_u32(vaddq_u32(r2, in2)); | | 309 | r2 = vhtole_u32(vaddq_u32(r2, in2)); |
314 | r3 = vhtole_u32(vaddq_u32(r3, in3)); | | 310 | r3 = vhtole_u32(vaddq_u32(r3, in3)); |
315 | | | 311 | |
316 | if (n < 64) { | | 312 | if (n < 64) { |
317 | uint8_t buf[64] __aligned(16); | | 313 | uint8_t buf[64] __aligned(16); |
318 | unsigned i; | | 314 | unsigned i; |
319 | | | 315 | |
320 | vst1q_u32((uint32_t *)buf + 4*0, r0); | | 316 | vst1q_u32((uint32_t *)buf + 4*0, r0); |
321 | vst1q_u32((uint32_t *)buf + 4*1, r1); | | 317 | vst1q_u32((uint32_t *)buf + 4*1, r1); |
322 | vst1q_u32((uint32_t *)buf + 4*2, r2); | | 318 | vst1q_u32((uint32_t *)buf + 4*2, r2); |
323 | vst1q_u32((uint32_t *)buf + 4*3, r3); | | 319 | vst1q_u32((uint32_t *)buf + 4*3, r3); |
324 | | | 320 | |
325 | for (i = 0; i < n - n%4; i += 4) | | 321 | for (i = 0; i < n - n%4; i += 4) |
326 | le32enc(s + i, | | 322 | le32enc(s + i, |
327 | le32dec(p + i) ^ le32dec(buf + i)); | | 323 | le32dec(p + i) ^ le32dec(buf + i)); |
328 | for (; i < n; i++) | | 324 | for (; i < n; i++) |
329 | s[i] = p[i] ^ buf[i]; | | 325 | s[i] = p[i] ^ buf[i]; |
330 | | | 326 | |
331 | break; | | 327 | break; |
332 | } | | 328 | } |
333 | | | 329 | |
334 | r0 ^= vld1q_u32((const uint32_t *)p + 4*0); | | 330 | r0 ^= vld1q_u32((const uint32_t *)p + 4*0); |
335 | r1 ^= vld1q_u32((const uint32_t *)p + 4*1); | | 331 | r1 ^= vld1q_u32((const uint32_t *)p + 4*1); |
336 | r2 ^= vld1q_u32((const uint32_t *)p + 4*2); | | 332 | r2 ^= vld1q_u32((const uint32_t *)p + 4*2); |
337 | r3 ^= vld1q_u32((const uint32_t *)p + 4*3); | | 333 | r3 ^= vld1q_u32((const uint32_t *)p + 4*3); |
338 | vst1q_u32((uint32_t *)s + 4*0, r0); | | 334 | vst1q_u32((uint32_t *)s + 4*0, r0); |
339 | vst1q_u32((uint32_t *)s + 4*1, r1); | | 335 | vst1q_u32((uint32_t *)s + 4*1, r1); |
340 | vst1q_u32((uint32_t *)s + 4*2, r2); | | 336 | vst1q_u32((uint32_t *)s + 4*2, r2); |
341 | vst1q_u32((uint32_t *)s + 4*3, r3); | | 337 | vst1q_u32((uint32_t *)s + 4*3, r3); |
342 | in3 = vaddq_u32(in3, blkno_inc); | | 338 | in3 = vaddq_u32(in3, blkno_inc); |
343 | } | | 339 | } |
344 | } | | 340 | } |
345 | } | | 341 | } |
346 | | | 342 | |
347 | void | | 343 | void |
348 | xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, | | 344 | xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, |
349 | uint32_t blkno, | | 345 | uint32_t blkno, |
350 | const uint8_t nonce[static 24], | | 346 | const uint8_t nonce[static 24], |
351 | const uint8_t k[static 32], | | 347 | const uint8_t k[static 32], |
352 | unsigned nr) | | 348 | unsigned nr) |
353 | { | | 349 | { |
354 | uint8_t subkey[32]; | | 350 | uint8_t subkey[32]; |
355 | uint8_t subnonce[12]; | | 351 | uint8_t subnonce[12]; |
356 | | | 352 | |
357 | hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); | | 353 | hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); |
358 | memset(subnonce, 0, 4); | | 354 | memset(subnonce, 0, 4); |
359 | memcpy(subnonce + 4, nonce + 16, 8); | | 355 | memcpy(subnonce + 4, nonce + 16, 8); |
360 | chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); | | 356 | chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); |
361 | } | | 357 | } |
362 | | | 358 | |
363 | void | | 359 | void |
364 | xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, | | 360 | xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, |
365 | uint32_t blkno, | | 361 | uint32_t blkno, |
366 | const uint8_t nonce[static 24], | | 362 | const uint8_t nonce[static 24], |
367 | const uint8_t k[static 32], | | 363 | const uint8_t k[static 32], |
368 | unsigned nr) | | 364 | unsigned nr) |
369 | { | | 365 | { |
370 | uint8_t subkey[32]; | | 366 | uint8_t subkey[32]; |
371 | uint8_t subnonce[12]; | | 367 | uint8_t subnonce[12]; |
372 | | | 368 | |
373 | hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); | | 369 | hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); |
374 | memset(subnonce, 0, 4); | | 370 | memset(subnonce, 0, 4); |
375 | memcpy(subnonce + 4, nonce + 16, 8); | | 371 | memcpy(subnonce + 4, nonce + 16, 8); |
376 | chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); | | 372 | chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); |
377 | } | | 373 | } |
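
A note on rol8 above: on a little-endian machine, rotating each 32-bit
lane left by 8 is the same byte permutation {3,0,1,2, 7,4,5,6, ...}
that the aarch64 vqtbl1q_u8 path uses, and the rot8 table in the new
32-bit assembly below (.long 0x02010003, 0x06050407) is those indices
read as little-endian bytes.  A minimal scalar check, assuming little
endian (hypothetical test program, not part of the commit):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint32_t
	rotl32(uint32_t x, int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	int
	main(void)
	{
		static const uint8_t tab[4] = { 3, 0, 1, 2 }; /* one lane of rol8_tab */
		uint32_t x = 0x11223344, r;
		uint8_t b[4], y[4];
		int i;

		memcpy(b, &x, 4);		/* LE: b = {44,33,22,11} */
		for (i = 0; i < 4; i++)
			y[i] = b[tab[i]];	/* y = {11,44,33,22} */
		memcpy(&r, y, 4);
		printf("%08x %08x\n", rotl32(x, 8), r); /* both 22334411 */
		return 0;
	}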
--- src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/28 20:08:48 1.3
| @@ -1,85 +1,83 @@ | | | @@ -1,85 +1,83 @@ |
1 | /* $NetBSD: chacha_neon.h,v 1.2 2020/07/27 20:51:29 riastradh Exp $ */ | | 1 | /* $NetBSD: chacha_neon.h,v 1.3 2020/07/28 20:08:48 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
15 | * | | 15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H | | 29 | #ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H |
30 | #define _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H | | 30 | #define _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H |
31 | | | 31 | |
32 | #include <sys/types.h> | | 32 | #include <sys/types.h> |
33 | | | 33 | |
34 | #include <crypto/chacha/chacha_impl.h> | | 34 | #include <crypto/chacha/chacha_impl.h> |
35 | | | 35 | |
36 | void chacha_core_neon(uint8_t[restrict static 64], | | 36 | void chacha_core_neon(uint8_t[restrict static 64], |
37 | const uint8_t[static 16], | | 37 | const uint8_t[static 16], |
38 | const uint8_t[static 32], | | 38 | const uint8_t[static 32], |
39 | const uint8_t[static 16], | | 39 | const uint8_t[static 16], |
40 | unsigned); | | 40 | unsigned); |
41 | void hchacha_neon(uint8_t[restrict static 32], | | 41 | void hchacha_neon(uint8_t[restrict static 32], |
42 | const uint8_t[static 16], | | 42 | const uint8_t[static 16], |
43 | const uint8_t[static 32], | | 43 | const uint8_t[static 32], |
44 | const uint8_t[static 16], | | 44 | const uint8_t[static 16], |
45 | unsigned); | | 45 | unsigned); |
46 | void chacha_stream_neon(uint8_t *restrict, size_t, | | 46 | void chacha_stream_neon(uint8_t *restrict, size_t, |
47 | uint32_t, | | 47 | uint32_t, |
48 | const uint8_t[static 12], | | 48 | const uint8_t[static 12], |
49 | const uint8_t[static 32], | | 49 | const uint8_t[static 32], |
50 | unsigned); | | 50 | unsigned); |
51 | void chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, | | 51 | void chacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, |
52 | uint32_t, | | 52 | uint32_t, |
53 | const uint8_t[static 12], | | 53 | const uint8_t[static 12], |
54 | const uint8_t[static 32], | | 54 | const uint8_t[static 32], |
55 | unsigned); | | 55 | unsigned); |
56 | void xchacha_stream_neon(uint8_t *restrict, size_t, | | 56 | void xchacha_stream_neon(uint8_t *restrict, size_t, |
57 | uint32_t, | | 57 | uint32_t, |
58 | const uint8_t[static 24], | | 58 | const uint8_t[static 24], |
59 | const uint8_t[static 32], | | 59 | const uint8_t[static 32], |
60 | unsigned); | | 60 | unsigned); |
61 | void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, | | 61 | void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, |
62 | uint32_t, | | 62 | uint32_t, |
63 | const uint8_t[static 24], | | 63 | const uint8_t[static 24], |
64 | const uint8_t[static 32], | | 64 | const uint8_t[static 32], |
65 | unsigned); | | 65 | unsigned); |
66 | | | 66 | |
67 | #ifdef __aarch64__ | | 67 | /* Assembly helpers */ |
68 | /* Assembly helpers -- aarch64 only for now */ | | | |
69 | void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, | | 68 | void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, |
70 | const uint8_t[static 12], | | 69 | const uint8_t[static 12], |
71 | const uint8_t[static 32], | | 70 | const uint8_t[static 32], |
72 | const uint8_t[static 16], | | 71 | const uint8_t[static 16], |
73 | unsigned); | | 72 | unsigned); |
74 | void chacha_stream_xor256_neon(uint8_t[restrict static 256], | | 73 | void chacha_stream_xor256_neon(uint8_t[restrict static 256], |
75 | const uint8_t[static 256], | | 74 | const uint8_t[static 256], |
76 | uint32_t, | | 75 | uint32_t, |
77 | const uint8_t[static 12], | | 76 | const uint8_t[static 12], |
78 | const uint8_t[static 32], | | 77 | const uint8_t[static 32], |
79 | const uint8_t[static 16], | | 78 | const uint8_t[static 16], |
80 | unsigned); | | 79 | unsigned); |
81 | #endif /* __aarch64__ */ | | | |
82 | | | 80 | |
83 | extern const struct chacha_impl chacha_neon_impl; | | 81 | extern const struct chacha_impl chacha_neon_impl; |
84 | | | 82 | |
85 | #endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */ | | 83 | #endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */ |
--- src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/28 20:08:48 1.3
| @@ -1,11 +1,12 @@ | | | @@ -1,11 +1,12 @@ |
1 | # $NetBSD: files.chacha_arm,v 1.2 2020/07/27 20:51:29 riastradh Exp $ | | 1 | # $NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $ |
2 | | | 2 | |
3 | ifdef aarch64 | | 3 | ifdef aarch64 |
4 | makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" | | 4 | makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" |
5 | else | | 5 | else |
6 | makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" | | 6 | makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" |
7 | endif | | 7 | endif |
8 | | | 8 | |
9 | file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64) | | 9 | file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64) |
| | | 10 | file crypto/chacha/arch/arm/chacha_neon_32.S chacha & cpu_cortex & !aarch64 |
10 | file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 | | 11 | file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 |
11 | file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64) | | 12 | file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64) |
/* $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")
.fpu neon
/*
* ChaCha round, split up so we can interleave the quarterrounds on
* independent rows/diagonals to maximize pipeline efficiency, with
* spills to deal with the scarcity of registers. Reference:
*
* Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
* Record of the State of the Art in Stream Ciphers -- SASC 2008.
* https://cr.yp.to/papers.html#chacha
*
* a += b; d ^= a; d <<<= 16;
* c += d; b ^= c; b <<<= 12;
* a += b; d ^= a; d <<<= 8;
* c += d; b ^= c; b <<<= 7;
*
* The rotations are implemented with:
* <<< 16 VREV32.16
* <<< 12 VSHL/VSRI (shift left, then shift right and insert)
* <<< 8 VTBL (general byte permutation; rot8 table below)
* <<< 7 VSHL/VSRI
*/
.macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
vld1.32 {\c2-\c3}, [fp, :256]
.endm
.macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
/* a += b; d ^= a; d <<<= 16 */
vadd.u32 \a0, \a0, \b0
vadd.u32 \a1, \a1, \b1
vadd.u32 \a2, \a2, \b2
vadd.u32 \a3, \a3, \b3
veor \d0, \d0, \a0
veor \d1, \d1, \a1
veor \d2, \d2, \a2
veor \d3, \d3, \a3
vrev32.16 \d0, \d0
vrev32.16 \d1, \d1
vrev32.16 \d2, \d2
vrev32.16 \d3, \d3
/* c += d; b ^= c; b <<<= 12 */
vadd.u32 \c0, \c0, \d0
vadd.u32 \c1, \c1, \d1
vadd.u32 \c2, \c2, \d2
vadd.u32 \c3, \c3, \d3
vst1.32 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */
veor \c0, \b0, \c0
veor \c1, \b1, \c1
vshl.u32 \b0, \c0, #12
vshl.u32 \b1, \c1, #12
vsri.u32 \b0, \c0, #(32 - 12)
vsri.u32 \b1, \c1, #(32 - 12)
veor \c0, \b2, \c2
veor \c1, \b3, \c3
vshl.u32 \b2, \c0, #12
vshl.u32 \b3, \c1, #12
vsri.u32 \b2, \c0, #(32 - 12)
vsri.u32 \b3, \c1, #(32 - 12)
vld1.8 {\c0l}, [r7, :64] /* load rot8 table */
/* a += b; d ^= a; d <<<= 8 */
vadd.u32 \a0, \a0, \b0
vadd.u32 \a1, \a1, \b1
vadd.u32 \a2, \a2, \b2
vadd.u32 \a3, \a3, \b3
veor \d0, \d0, \a0
veor \d1, \d1, \a1
veor \d2, \d2, \a2
veor \d3, \d3, \a3
vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */
vtbl.8 \d0h, {\d0h}, \c0l
vtbl.8 \d1l, {\d1l}, \c0l
vtbl.8 \d1h, {\d1h}, \c0l
vtbl.8 \d2l, {\d2l}, \c0l
vtbl.8 \d2h, {\d2h}, \c0l
vtbl.8 \d3l, {\d3l}, \c0l
vtbl.8 \d3h, {\d3h}, \c0l
vld1.32 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */
/* c += d; b ^= c; b <<<= 7 */
vadd.u32 \c2, \c2, \d2
vadd.u32 \c3, \c3, \d3
vadd.u32 \c0, \c0, \d0
vadd.u32 \c1, \c1, \d1
vst1.32 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */
veor \c2, \b2, \c2
veor \c3, \b3, \c3
vshl.u32 \b2, \c2, #7
vshl.u32 \b3, \c3, #7
vsri.u32 \b2, \c2, #(32 - 7)
vsri.u32 \b3, \c3, #(32 - 7)
veor \c2, \b0, \c0
veor \c3, \b1, \c1
vshl.u32 \b0, \c2, #7
vshl.u32 \b1, \c3, #7
vsri.u32 \b0, \c2, #(32 - 7)
vsri.u32 \b1, \c3, #(32 - 7)
.endm
#if _BYTE_ORDER == _LITTLE_ENDIAN
#define HTOLE32(x)
#define LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define HTOLE32(x) vrev32.8 x, x
#define LE32TOH(x) vrev32.8 x, x
#endif
.text
.p2align 2
.Lconstants_addr:
.long .Lconstants - .
/*
* chacha_stream256_neon(uint8_t s[256]@r0,
* uint32_t blkno@r1,
* const uint8_t nonce[12]@r2,
* const uint8_t key[32]@r3,
* const uint8_t const[16]@sp[0],
* unsigned nr@sp[4])
*/
ENTRY(chacha_stream256_neon)
/* save callee-saves registers */
push {r4, r5, r6, r7, r8, r10, fp, lr}
vpush {d8-d15}
/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
ldr r7, .Lconstants_addr
adr r6, .Lconstants_addr
/* reserve space for two 128-bit/16-byte q registers */
sub fp, sp, #0x20
bic fp, fp, #0x1f /* align */
/* get parameters */
add ip, sp, #96
add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
ldm ip, {r4, r5} /* r4 := const, r5 := nr */
ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
vld1.32 {q12}, [r4] /* q12 := constant */
vld1.32 {q13-q14}, [r3] /* q13-q14 := key */
vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
vdup.32 q0, d24[0] /* q0-q3 := constant */
vdup.32 q1, d24[1]
vdup.32 q2, d25[0]
vdup.32 q3, d25[1]
vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */
vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
vdup.32 q5, d26[1]
vdup.32 q6, d27[0]
vdup.32 q7, d27[1]
vdup.32 q8, d28[0]
vdup.32 q9, d28[1]
vdup.32 q10, d29[0]
vdup.32 q11, d29[1]
vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
vdup.32 q13, r6 /* q13-q15 := nonce */
vdup.32 q14, r8
vdup.32 q15, r10
HTOLE32(q0)
HTOLE32(q1)
HTOLE32(q2)
HTOLE32(q3)
HTOLE32(q4)
HTOLE32(q5)
HTOLE32(q6)
HTOLE32(q7)
HTOLE32(q8)
HTOLE32(q9)
HTOLE32(q10)
HTOLE32(q11)
HTOLE32(q12)
HTOLE32(q13)
HTOLE32(q14)
HTOLE32(q15)
b 2f
_ALIGN_TEXT
1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2: subs r5, r5, #2
ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
d16, d24,d25, d26,d27, d28,d29, d30,d31
ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
d20, d30,d31, d24,d25, d26,d27, d28,d29
bne 1b
/*
* q8-q9 are free / saved on the stack. We have:
*
* q0 = (x0[0], x1[0]; x2[0], x3[0])
* q1 = (x0[1], x1[1]; x2[1], x3[1])
* q2 = (x0[2], x1[2]; x2[2], x3[2])
* q3 = (x0[3], x1[3]; x2[3], x3[3])
* ...
* q15 = (x0[15], x1[15]; x2[15], x3[15])
*
* where xi[j] is the jth word of the ith 16-word block. Zip
* consecutive pairs with vzip.32, and you get:
*
* q0 = (x0[0], x0[1]; x1[0], x1[1])
* q1 = (x2[0], x2[1]; x3[0], x3[1])
* q2 = (x0[2], x0[3]; x1[2], x1[3])
* q3 = (x2[2], x2[3]; x3[2], x3[3])
* ...
* q15 = (x2[14], x2[15]; x3[14], x3[15])
*
* As 64-bit d registers, this is:
*
* d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1])
* d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1])
* d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3])
* d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3])
* ...
* d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15])
*
* Swap d1<->d4, d3<->d6, ..., and you get:
*
* q0 = (x0[0], x0[1]; x0[2], x0[3])
* q1 = (x2[0], x2[1]; x2[2], x2[3])
* q2 = (x1[0], x1[1]; x1[2], x1[3])
* q3 = (x3[0], x3[1]; x3[2], x3[3])
* ...
* q15 = (x15[0], x15[1]; x15[2], x15[3])
*/
sub r7, r7, #0x10
vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */
vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
vzip.32 q0, q1
vzip.32 q2, q3
vzip.32 q4, q5
vzip.32 q6, q7
vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
vld1.32 {q9}, [r4] /* q9 := constant */
vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
vld1.32 {q8}, [r3]! /* q8 := key[0:16) */
vswp d1, d4
vswp d9, d12
vswp d3, d6
vswp d11, d14
/*
* At this point, the blocks are:
*
* q0 = (x0[0], x0[1]; x0[2], x0[3])
* q1 = (x2[0], x2[1]; x2[2], x2[3])
* q2 = (x1[0], x1[1]; x1[2], x1[3])
* q3 = (x3[0], x3[1]; x3[2], x3[3])
* q4 = (x0[4], x0[5]; x0[6], x0[7])
* q5 = (x2[4], x2[5]; x2[6], x2[7])
* q6 = (x1[4], x1[5]; x1[6], x1[7])
* q7 = (x3[4], x3[5]; x3[6], x3[7])
*
* The first two rows to write out are q0 = x0[0:4) and q4 =
* x0[4:8). If we first swap q1 and q4, then once we've
* written them out we free up consecutive registers q0-q1 for
* store-multiple.
*/
vswp q1, q4
vadd.u32 q0, q0, q9
vadd.u32 q4, q4, q9
vadd.u32 q2, q2, q9
vadd.u32 q3, q3, q9
vadd.u32 q1, q1, q8
vadd.u32 q5, q5, q8
vadd.u32 q6, q6, q8
vadd.u32 q7, q7, q8
vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */
LE32TOH(q0)
LE32TOH(q1)
LE32TOH(q2)
LE32TOH(q3)
LE32TOH(q4)
LE32TOH(q5)
LE32TOH(q6)
LE32TOH(q7)
vst1.32 {q0-q1}, [r0]!
vld1.32 {q0}, [r3] /* q0 := key[16:32) */
mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
vmov d2, r3, r6
vmov d3, r8, r10
vzip.32 q8, q9
vzip.32 q10, q11
vzip.32 q12, q13
vzip.32 q14, q15
vswp d17, d20
vswp d25, d28
vswp d19, d22
vswp d27, d30
vadd.u32 q8, q8, q0
vadd.u32 q9, q9, q0
vadd.u32 q10, q10, q0
vadd.u32 q11, q11, q0
vadd.u32 q12, q12, q1
vadd.u32 q13, q13, q1
vadd.u32 q14, q14, q1
vadd.u32 q15, q15, q1
LE32TOH(q8)
LE32TOH(q9)
LE32TOH(q10)
LE32TOH(q11)
LE32TOH(q12)
LE32TOH(q13)
LE32TOH(q14)
LE32TOH(q15)
/* prepare to zero temporary space on stack */
vmov.i32 q0, #0
vmov.i32 q1, #0
/* vst1.32 {q0}, [r0]! */
/* vst1.32 {q1}, [r0]! */ /* (was q4 before vswp) */
vst1.32 {q8}, [r0]!
vst1.32 {q12}, [r0]!
vst1.32 {q2}, [r0]!
vst1.32 {q6}, [r0]!
vst1.32 {q10}, [r0]!
vst1.32 {q14}, [r0]!
vst1.32 {q4}, [r0]! /* (was q1 before vswp) */
vst1.32 {q5}, [r0]!
vst1.32 {q9}, [r0]!
vst1.32 {q13}, [r0]!
vst1.32 {q3}, [r0]!
vst1.32 {q7}, [r0]!
vst1.32 {q11}, [r0]!
vst1.32 {q15}, [r0]
/* zero temporary space on the stack */
vst1.8 {q0-q1}, [fp, :256]
/* restore callee-saves registers and stack */
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r10, fp, lr}
bx lr
END(chacha_stream256_neon)
/*
* chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
* uint32_t blkno@r2,
* const uint8_t nonce[12]@r3,
* const uint8_t key[32]@sp[0],
* const uint8_t const[16]@sp[4],
* unsigned nr@sp[8])
*/
ENTRY(chacha_stream_xor256_neon)
/* save callee-saves registers */
push {r4, r5, r6, r7, r8, r10, fp, lr}
vpush {d8-d15}
/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
ldr r7, .Lconstants_addr
adr r6, .Lconstants_addr
/* reserve space for two 128-bit/16-byte q registers */
sub fp, sp, #0x20
bic fp, fp, #0x1f /* align */
/* get parameters */
add ip, sp, #96
add r7, r7, r6 /* r7 := .Lconstants (= v0123) */
ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */
ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */
vld1.32 {q12}, [r5] /* q12 := constant */
vld1.32 {q13-q14}, [r4] /* q13-q14 := key */
vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
vdup.32 q0, d24[0] /* q0-q3 := constant */
vdup.32 q1, d24[1]
vdup.32 q2, d25[0]
vdup.32 q3, d25[1]
vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */
vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */
vdup.32 q5, d26[1]
vdup.32 q6, d27[0]
vdup.32 q7, d27[1]
vdup.32 q8, d28[0]
vdup.32 q9, d28[1]
vdup.32 q10, d29[0]
vdup.32 q11, d29[1]
vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
vdup.32 q13, r6 /* q13-q15 := nonce */
vdup.32 q14, r8
vdup.32 q15, r10
HTOLE32(q0)
HTOLE32(q1)
HTOLE32(q2)
HTOLE32(q3)
HTOLE32(q4)
HTOLE32(q5)
HTOLE32(q6)
HTOLE32(q7)
HTOLE32(q8)
HTOLE32(q9)
HTOLE32(q10)
HTOLE32(q11)
HTOLE32(q12)
HTOLE32(q13)
HTOLE32(q14)
HTOLE32(q15)
b 2f
_ALIGN_TEXT
1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2: subs ip, ip, #2
ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
d16, d24,d25, d26,d27, d28,d29, d30,d31
ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
d20, d30,d31, d24,d25, d26,d27, d28,d29
bne 1b
/*
* q8-q9 are free / saved on the stack. Now for the real fun:
* in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
* {0,1,2,...,15}. The twist is that the p[i] and the y[i] are
* transposed from one another, and the x[i] are in general
* registers and memory. So we have:
*
* q0 = (x0[0], x1[0]; x2[0], x3[0])
* q1 = (x0[1], x1[1]; x2[1], x3[1])
* q2 = (x0[2], x1[2]; x2[2], x3[2])
* q3 = (x0[3], x1[3]; x2[3], x3[3])
* ...
* q15 = (x0[15], x1[15]; x2[15], x3[15])
*
* where xi[j] is the jth word of the ith 16-word block. Zip
* consecutive pairs with vzip.32, and you get:
*
* q0 = (x0[0], x0[1]; x1[0], x1[1])
* q1 = (x2[0], x2[1]; x3[0], x3[1])
* q2 = (x0[2], x0[3]; x1[2], x1[3])
* q3 = (x2[2], x2[3]; x3[2], x3[3])
* ...
* q15 = (x2[14], x2[15]; x3[14], x3[15])
*
* As 64-bit d registers, this is:
*
* d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1])
* d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1])
* d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3])
* d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3])
* ...
* d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15])
*
* Swap d1<->d4, d3<->d6, ..., and you get:
*
* q0 = (x0[0], x0[1]; x0[2], x0[3])
* q1 = (x2[0], x2[1]; x2[2], x2[3])
* q2 = (x1[0], x1[1]; x1[2], x1[3])
* q3 = (x3[0], x3[1]; x3[2], x3[3])
* ...
* q15 = (x15[0], x15[1]; x15[2], x15[3])
*/
sub r7, r7, #0x10
vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */
vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
vzip.32 q0, q1
vzip.32 q2, q3
vzip.32 q4, q5
vzip.32 q6, q7
vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
vld1.32 {q9}, [r5] /* q9 := constant */
vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
vld1.32 {q8}, [r4]! /* q8 := key[0:16) */
vswp d1, d4
vswp d9, d12
vswp d3, d6
vswp d11, d14
/*
* At this point, the blocks are:
*
* q0 = (x0[0], x0[1]; x0[2], x0[3])
* q1 = (x2[0], x2[1]; x2[2], x2[3])
* q2 = (x1[0], x1[1]; x1[2], x1[3])
* q3 = (x3[0], x3[1]; x3[2], x3[3])
* q4 = (x0[4], x0[5]; x0[6], x0[7])
* q5 = (x2[4], x2[5]; x2[6], x2[7])
* q6 = (x1[4], x1[5]; x1[6], x1[7])
* q7 = (x3[4], x3[5]; x3[6], x3[7])
*
* The first two rows to write out are q0 = x0[0:4) and q4 =
* x0[4:8). If we first swap q1 and q4, then once we've
* written them out we free up consecutive registers q0-q1 for
* store-multiple.
*/
vswp q1, q4
vadd.u32 q0, q0, q9
vadd.u32 q4, q4, q9
vadd.u32 q2, q2, q9
vadd.u32 q3, q3, q9
vadd.u32 q1, q1, q8
vadd.u32 q5, q5, q8
vadd.u32 q6, q6, q8
vadd.u32 q7, q7, q8
vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */
LE32TOH(q0)
LE32TOH(q1)
LE32TOH(q2)
LE32TOH(q6)
LE32TOH(q4)
LE32TOH(q5)
LE32TOH(q3)
LE32TOH(q7)
veor q0, q0, q8 /* compute ciphertext bytes [0:32) */
veor q1, q1, q9
vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */
vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */
vld1.32 {q0}, [r4] /* q0 := key[16:32) */
mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
vmov d2, r3, r6
vmov d3, r8, r10
vzip.32 q8, q9
vzip.32 q10, q11
vzip.32 q12, q13
vzip.32 q14, q15
vswp d17, d20
vswp d25, d28
vswp d19, d22
vswp d27, d30
vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */
vadd.u32 q8, q8, q0
vadd.u32 q12, q12, q0
vadd.u32 q10, q10, q0
vadd.u32 q11, q11, q0
vadd.u32 q9, q9, q1
vadd.u32 q13, q13, q1
vadd.u32 q14, q14, q1
vadd.u32 q15, q15, q1
vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */
LE32TOH(q8)
LE32TOH(q9)
LE32TOH(q10)
LE32TOH(q14)
LE32TOH(q12)
LE32TOH(q13)
LE32TOH(q11)
LE32TOH(q15)
veor q0, q0, q8 /* compute ciphertext bytes [32:64) */
veor q1, q1, q9
vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */
vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */
vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */
veor q2, q2, q8 /* compute ciphertext bytes [64:96) */
veor q6, q6, q9
vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */
vst1.32 {q2}, [r0]! /* store ciphertext bytes [64:80) */
veor q10, q10, q0 /* compute ciphertext bytes [96:128) */
veor q14, q14, q1
vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */
vst1.32 {q6}, [r0]! /* store ciphertext bytes [80:96) */
veor q4, q4, q8 /* compute ciphertext bytes [128:160) */
veor q5, q5, q9
vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */
vst1.32 {q10}, [r0]! /* store ciphertext bytes [96:112) */
veor q12, q12, q0 /* compute ciphertext bytes [160:192) */
veor q13, q13, q1
vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */
vst1.32 {q14}, [r0]! /* store ciphertext bytes [112:128) */
veor q8, q3, q8 /* compute ciphertext bytes [192:224) */
veor q9, q7, q9
vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */
vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */
veor q0, q11, q0 /* compute ciphertext bytes [224:256) */
veor q1, q15, q1
vst1.32 {q8-q9}, [r0]! /* store ciphertext bytes [192:224) */
vst1.32 {q0-q1}, [r0] /* store ciphertext bytes [224:256) */
/* zero temporary space on the stack */
vmov.i32 q0, #0
vmov.i32 q1, #0
vst1.8 {q0-q1}, [fp, :256]
/* restore callee-saves registers and stack */
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r10, fp, lr}
bx lr
END(chacha_stream_xor256_neon)
.section .rodata
.p2align 4
.Lconstants:
.type v0123,%object
v0123:
.long 0, 1, 2, 3
END(v0123)
.type rot8,%object
rot8:
.long 0x02010003, 0x06050407
END(rot8)
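
The vzip.32/vswp sequences in the two routines above implement a 4x4
transpose of 32-bit words across q registers, turning word-sliced
state (q_i holds word i of four blocks) into block-sliced rows for the
stores.  Roughly the same operation in NEON intrinsics, as a sketch
with illustrative names, not part of the commit (the assembly pairs
its swaps so the rows come out in the order x0, x2, x1, x3, which is
why its stores are interleaved; this version leaves them in order):

	#include <arm_neon.h>

	/* Transpose a 4x4 matrix of uint32 held in four q registers. */
	static void
	transpose4x4(uint32x4_t *r0, uint32x4_t *r1, uint32x4_t *r2,
	    uint32x4_t *r3)
	{
		/* zip consecutive pairs: the vzip.32 step */
		uint32x4x2_t z01 = vzipq_u32(*r0, *r1);
		uint32x4x2_t z23 = vzipq_u32(*r2, *r3);

		/* swap 64-bit halves across registers: the vswp step */
		*r0 = vcombine_u32(vget_low_u32(z01.val[0]),
		    vget_low_u32(z23.val[0]));
		*r1 = vcombine_u32(vget_high_u32(z01.val[0]),
		    vget_high_u32(z23.val[0]));
		*r2 = vcombine_u32(vget_low_u32(z01.val[1]),
		    vget_low_u32(z23.val[1]));
		*r3 = vcombine_u32(vget_high_u32(z01.val[1]),
		    vget_high_u32(z23.val[1]));
	}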
--- src/tests/sys/crypto/chacha/Makefile 2020/07/27 20:51:29 1.2
+++ src/tests/sys/crypto/chacha/Makefile 2020/07/28 20:08:48 1.3
| @@ -1,45 +1,47 @@ | | | @@ -1,45 +1,47 @@ |
1 | # $NetBSD: Makefile,v 1.2 2020/07/27 20:51:29 riastradh Exp $ | | 1 | # $NetBSD: Makefile,v 1.3 2020/07/28 20:08:48 riastradh Exp $ |
2 | | | 2 | |
3 | .include <bsd.own.mk> | | 3 | .include <bsd.own.mk> |
4 | | | 4 | |
5 | TESTSDIR= ${TESTSBASE}/sys/crypto/chacha | | 5 | TESTSDIR= ${TESTSBASE}/sys/crypto/chacha |
6 | | | 6 | |
7 | TESTS_C= t_chacha | | 7 | TESTS_C= t_chacha |
8 | | | 8 | |
9 | AFLAGS+= -D_LOCORE | | 9 | AFLAGS+= -D_LOCORE |
10 | | | 10 | |
11 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha | | 11 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha |
12 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys | | 12 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys |
13 | | | 13 | |
14 | SRCS.t_chacha+= t_chacha.c | | 14 | SRCS.t_chacha+= t_chacha.c |
15 | | | 15 | |
16 | SRCS.t_chacha+= chacha_ref.c | | 16 | SRCS.t_chacha+= chacha_ref.c |
17 | SRCS.t_chacha+= chacha_selftest.c | | 17 | SRCS.t_chacha+= chacha_selftest.c |
18 | | | 18 | |
19 | .if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) | | 19 | .if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) |
20 | | | 20 | |
21 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm | | 21 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm |
22 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm | | 22 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm |
23 | | | 23 | |
24 | SRCS.t_chacha+= chacha_neon.c | | 24 | SRCS.t_chacha+= chacha_neon.c |
25 | .if !empty(MACHINE_ARCH:Maarch64*) | | 25 | .if !empty(MACHINE_ARCH:Mearmv7*) |
| | | 26 | SRCS.t_chacha+= chacha_neon_32.S |
| | | 27 | .elif !empty(MACHINE_ARCH:Maarch64*) |
26 | SRCS.t_chacha+= chacha_neon_64.S | | 28 | SRCS.t_chacha+= chacha_neon_64.S |
27 | .endif | | 29 | .endif |
28 | SRCS.t_chacha+= chacha_neon_impl.c | | 30 | SRCS.t_chacha+= chacha_neon_impl.c |
29 | | | 31 | |
30 | .endif # earmv7 or aarch64 | | 32 | .endif # earmv7 or aarch64 |
31 | | | 33 | |
32 | .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64" | | 34 | .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64" |
33 | | | 35 | |
34 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 | | 36 | .PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 |
35 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 | | 37 | CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 |
36 | | | 38 | |
37 | SRCS.t_chacha+= chacha_sse2.c | | 39 | SRCS.t_chacha+= chacha_sse2.c |
38 | SRCS.t_chacha+= chacha_sse2_impl.c | | 40 | SRCS.t_chacha+= chacha_sse2_impl.c |
39 | COPTS.chacha_sse2.c+= -msse -msse2 | | 41 | COPTS.chacha_sse2.c+= -msse -msse2 |
40 | | | 42 | |
41 | .endif # x86 | | 43 | .endif # x86 |
42 | | | 44 | |
43 | WARNS= 5 | | 45 | WARNS= 5 |
44 | | | 46 | |
45 | .include <bsd.test.mk> | | 47 | .include <bsd.test.mk> |