| @@ -1,692 +1,641 @@ | | | @@ -1,692 +1,641 @@ |
1 | /* $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $ */ | | 1 | /* $NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
15 | * | | 15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #include <machine/asm.h> | | 29 | #include <machine/asm.h> |
30 | | | 30 | |
31 | RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $") | | 31 | RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $") |
32 | | | 32 | |
33 | .fpu neon | | 33 | .fpu neon |
34 | | | 34 | |
35 | /* | | 35 | /* |
36 | * ChaCha round, split up so we can interleave the quarterrounds on | | 36 | * ChaCha round, split up so we can interleave the quarterrounds on |
37 | * independent rows/diagonals to maximize pipeline efficiency, with | | 37 | * independent rows/diagonals to maximize pipeline efficiency, with |
38 | * spills to deal with the scarcity of registers. Reference: | | 38 | * spills to deal with the scarcity of registers. Reference: |
39 | * | | 39 | * |
40 | * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop | | 40 | * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop |
41 | * Record of the State of the Art in Stream Ciphers -- SASC 2008. | | 41 | * Record of the State of the Art in Stream Ciphers -- SASC 2008. |
42 | * https://cr.yp.to/papers.html#chacha | | 42 | * https://cr.yp.to/papers.html#chacha |
43 | * | | 43 | * |
44 | * a += b; d ^= a; d <<<= 16; | | 44 | * a += b; d ^= a; d <<<= 16; |
45 | * c += d; b ^= c; b <<<= 12; | | 45 | * c += d; b ^= c; b <<<= 12; |
46 | * a += b; d ^= a; d <<<= 8; | | 46 | * a += b; d ^= a; d <<<= 8; |
47 | * c += d; b ^= c; b <<<= 7; | | 47 | * c += d; b ^= c; b <<<= 7; |
48 | * | | 48 | * |
49 | * The rotations are implemented with: | | 49 | * The rotations are implemented with: |
50 | * <<< 16 VREV32.16 for 16, | | 50 | * <<< 16 VREV32.16 for 16, |
51 | * <<< 12 VSHL/VSRI/VORR (shift left, shift right and insert, OR) | | 51 | * <<< 12 VSHL/VSRI/VORR (shift left, shift right and insert, OR) |
52 | * <<< 8 TBL (general permutation; rot8 below stored in r7) | | 52 | * <<< 8 TBL (general permutation; rot8 below stored in r7) |
53 | * <<< 7 VSHL/VSRI/VORR | | 53 | * <<< 7 VSHL/VSRI/VORR |
54 | */ | | 54 | */ |
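
    [For cross-reference, the quarterround the macros below vectorize four
    ways, as a minimal C sketch (not from the source): rotl32 models the
    VSHL/VSRI pairs, and the NEON code special-cases the 16- and 8-bit
    rotations as noted above.]

        #include <stdint.h>

        static inline uint32_t
        rotl32(uint32_t x, unsigned n)          /* VSHL + VSRI pair */
        {
                return (x << n) | (x >> (32 - n));
        }

        static inline void
        quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
        {
                *a += *b; *d ^= *a; *d = rotl32(*d, 16);  /* VREV32.16 */
                *c += *d; *b ^= *c; *b = rotl32(*b, 12);
                *a += *b; *d ^= *a; *d = rotl32(*d, 8);   /* VTBL.8 */
                *c += *d; *b ^= *c; *b = rotl32(*b, 7);
        }
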
55 | | | 55 | |
56 | .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3 | | 56 | .macro ROUNDLD a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3 |
57 | vld1.32 {\c2-\c3}, [fp, :256] | | 57 | vld1.32 {\c2-\c3}, [fp, :256] |
58 | .endm | | 58 | .endm |
59 | | | 59 | |
60 | .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h | | 60 | .macro ROUND a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h |
61 | /* a += b; d ^= a; d <<<= 16 */ | | 61 | /* a += b; d ^= a; d <<<= 16 */ |
62 | vadd.u32 \a0, \a0, \b0 | | 62 | vadd.u32 \a0, \a0, \b0 |
63 | vadd.u32 \a1, \a1, \b1 | | 63 | vadd.u32 \a1, \a1, \b1 |
64 | vadd.u32 \a2, \a2, \b2 | | 64 | vadd.u32 \a2, \a2, \b2 |
65 | vadd.u32 \a3, \a3, \b3 | | 65 | vadd.u32 \a3, \a3, \b3 |
66 | | | 66 | |
67 | veor \d0, \d0, \a0 | | 67 | veor \d0, \d0, \a0 |
68 | veor \d1, \d1, \a1 | | 68 | veor \d1, \d1, \a1 |
69 | veor \d2, \d2, \a2 | | 69 | veor \d2, \d2, \a2 |
70 | veor \d3, \d3, \a3 | | 70 | veor \d3, \d3, \a3 |
71 | | | 71 | |
72 | vrev32.16 \d0, \d0 | | 72 | vrev32.16 \d0, \d0 |
73 | vrev32.16 \d1, \d1 | | 73 | vrev32.16 \d1, \d1 |
74 | vrev32.16 \d2, \d2 | | 74 | vrev32.16 \d2, \d2 |
75 | vrev32.16 \d3, \d3 | | 75 | vrev32.16 \d3, \d3 |
76 | | | 76 | |
77 | /* c += d; b ^= c; b <<<= 12 */ | | 77 | /* c += d; b ^= c; b <<<= 12 */ |
78 | vadd.u32 \c0, \c0, \d0 | | 78 | vadd.u32 \c0, \c0, \d0 |
79 | vadd.u32 \c1, \c1, \d1 | | 79 | vadd.u32 \c1, \c1, \d1 |
80 | vadd.u32 \c2, \c2, \d2 | | 80 | vadd.u32 \c2, \c2, \d2 |
81 | vadd.u32 \c3, \c3, \d3 | | 81 | vadd.u32 \c3, \c3, \d3 |
82 | | | 82 | |
83 | vst1.32 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ | | 83 | vst1.32 {\c0-\c1}, [fp, :256] /* free c0 and c1 as temps */ |
84 | | | 84 | |
85 | veor \c0, \b0, \c0 | | 85 | veor \c0, \b0, \c0 |
86 | veor \c1, \b1, \c1 | | 86 | veor \c1, \b1, \c1 |
87 | vshl.u32 \b0, \c0, #12 | | 87 | vshl.u32 \b0, \c0, #12 |
88 | vshl.u32 \b1, \c1, #12 | | 88 | vshl.u32 \b1, \c1, #12 |
89 | vsri.u32 \b0, \c0, #(32 - 12) | | 89 | vsri.u32 \b0, \c0, #(32 - 12) |
90 | vsri.u32 \b1, \c1, #(32 - 12) | | 90 | vsri.u32 \b1, \c1, #(32 - 12) |
91 | | | 91 | |
92 | veor \c0, \b2, \c2 | | 92 | veor \c0, \b2, \c2 |
93 | veor \c1, \b3, \c3 | | 93 | veor \c1, \b3, \c3 |
94 | vshl.u32 \b2, \c0, #12 | | 94 | vshl.u32 \b2, \c0, #12 |
95 | vshl.u32 \b3, \c1, #12 | | 95 | vshl.u32 \b3, \c1, #12 |
96 | vsri.u32 \b2, \c0, #(32 - 12) | | 96 | vsri.u32 \b2, \c0, #(32 - 12) |
97 | vsri.u32 \b3, \c1, #(32 - 12) | | 97 | vsri.u32 \b3, \c1, #(32 - 12) |
98 | | | 98 | |
99 | vld1.8 {\c0l}, [r7, :64] /* load rot8 table */ | | 99 | vld1.8 {\c0l}, [r7, :64] /* load rot8 table */ |
100 | | | 100 | |
101 | /* a += b; d ^= a; d <<<= 8 */ | | 101 | /* a += b; d ^= a; d <<<= 8 */ |
102 | vadd.u32 \a0, \a0, \b0 | | 102 | vadd.u32 \a0, \a0, \b0 |
103 | vadd.u32 \a1, \a1, \b1 | | 103 | vadd.u32 \a1, \a1, \b1 |
104 | vadd.u32 \a2, \a2, \b2 | | 104 | vadd.u32 \a2, \a2, \b2 |
105 | vadd.u32 \a3, \a3, \b3 | | 105 | vadd.u32 \a3, \a3, \b3 |
106 | | | 106 | |
107 | veor \d0, \d0, \a0 | | 107 | veor \d0, \d0, \a0 |
108 | veor \d1, \d1, \a1 | | 108 | veor \d1, \d1, \a1 |
109 | veor \d2, \d2, \a2 | | 109 | veor \d2, \d2, \a2 |
110 | veor \d3, \d3, \a3 | | 110 | veor \d3, \d3, \a3 |
111 | | | 111 | |
112 | vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */ | | 112 | vtbl.8 \d0l, {\d0l}, \c0l /* <<< 8 */ |
113 | vtbl.8 \d0h, {\d0h}, \c0l | | 113 | vtbl.8 \d0h, {\d0h}, \c0l |
114 | vtbl.8 \d1l, {\d1l}, \c0l | | 114 | vtbl.8 \d1l, {\d1l}, \c0l |
115 | vtbl.8 \d1h, {\d1h}, \c0l | | 115 | vtbl.8 \d1h, {\d1h}, \c0l |
116 | vtbl.8 \d2l, {\d2l}, \c0l | | 116 | vtbl.8 \d2l, {\d2l}, \c0l |
117 | vtbl.8 \d2h, {\d2h}, \c0l | | 117 | vtbl.8 \d2h, {\d2h}, \c0l |
118 | vtbl.8 \d3l, {\d3l}, \c0l | | 118 | vtbl.8 \d3l, {\d3l}, \c0l |
119 | vtbl.8 \d3h, {\d3h}, \c0l | | 119 | vtbl.8 \d3h, {\d3h}, \c0l |
120 | | | 120 | |
121 | vld1.32 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ | | 121 | vld1.32 {\c0-\c1}, [fp, :256] /* restore c0 and c1 */ |
122 | | | 122 | |
123 | /* c += d; b ^= c; b <<<= 7 */ | | 123 | /* c += d; b ^= c; b <<<= 7 */ |
124 | vadd.u32 \c2, \c2, \d2 | | 124 | vadd.u32 \c2, \c2, \d2 |
125 | vadd.u32 \c3, \c3, \d3 | | 125 | vadd.u32 \c3, \c3, \d3 |
126 | vadd.u32 \c0, \c0, \d0 | | 126 | vadd.u32 \c0, \c0, \d0 |
127 | vadd.u32 \c1, \c1, \d1 | | 127 | vadd.u32 \c1, \c1, \d1 |
128 | | | 128 | |
129 | vst1.32 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ | | 129 | vst1.32 {\c2-\c3}, [fp, :256] /* free c2 and c3 as temps */ |
130 | | | 130 | |
131 | veor \c2, \b2, \c2 | | 131 | veor \c2, \b2, \c2 |
132 | veor \c3, \b3, \c3 | | 132 | veor \c3, \b3, \c3 |
133 | vshl.u32 \b2, \c2, #7 | | 133 | vshl.u32 \b2, \c2, #7 |
134 | vshl.u32 \b3, \c3, #7 | | 134 | vshl.u32 \b3, \c3, #7 |
135 | vsri.u32 \b2, \c2, #(32 - 7) | | 135 | vsri.u32 \b2, \c2, #(32 - 7) |
136 | vsri.u32 \b3, \c3, #(32 - 7) | | 136 | vsri.u32 \b3, \c3, #(32 - 7) |
137 | | | 137 | |
138 | veor \c2, \b0, \c0 | | 138 | veor \c2, \b0, \c0 |
139 | veor \c3, \b1, \c1 | | 139 | veor \c3, \b1, \c1 |
140 | vshl.u32 \b0, \c2, #7 | | 140 | vshl.u32 \b0, \c2, #7 |
141 | vshl.u32 \b1, \c3, #7 | | 141 | vshl.u32 \b1, \c3, #7 |
142 | vsri.u32 \b0, \c2, #(32 - 7) | | 142 | vsri.u32 \b0, \c2, #(32 - 7) |
143 | vsri.u32 \b1, \c3, #(32 - 7) | | 143 | vsri.u32 \b1, \c3, #(32 - 7) |
144 | .endm | | 144 | .endm |
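
    [The vtbl.8 step uses the rot8 table defined at the bottom of the file
    (.long 0x02010003, 0x06050407). On a little-endian host those eight
    table bytes are 3,0,1,2,7,4,5,6, and indexing source bytes through them
    rotates each 32-bit lane left by 8. A hypothetical standalone C check
    of that claim, not part of the source, assuming little-endian:]

        #include <assert.h>
        #include <stdint.h>
        #include <string.h>

        int
        main(void)
        {
                /* bytes of { 0x02010003, 0x06050407 } on little-endian */
                static const uint8_t idx[8] = { 3, 0, 1, 2, 7, 4, 5, 6 };
                uint32_t in[2] = { 0x12345678, 0x9abcdef0 }, out[2];
                uint8_t s[8], d[8];
                int i;

                memcpy(s, in, 8);
                for (i = 0; i < 8; i++)         /* VTBL.8 d, {s}, idx */
                        d[i] = s[idx[i]];
                memcpy(out, d, 8);
                assert(out[0] == ((in[0] << 8) | (in[0] >> 24)));  /* <<< 8 */
                assert(out[1] == ((in[1] << 8) | (in[1] >> 24)));
                return 0;
        }
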
145 | | | 145 | |
146 | #if _BYTE_ORDER == _LITTLE_ENDIAN | | 146 | #if _BYTE_ORDER == _LITTLE_ENDIAN |
147 | #define HTOLE32(x) | | 147 | #define HTOLE32(x) |
148 | #define LE32TOH(x) | | 148 | #define LE32TOH(x) |
149 | #elif _BYTE_ORDER == _BIG_ENDIAN | | 149 | #elif _BYTE_ORDER == _BIG_ENDIAN |
150 | #define HTOLE32(x) vrev32.8 x, x | | 150 | #define HTOLE32(x) vrev32.8 x, x |
151 | #define LE32TOH(x) vrev32.8 x, x | | 151 | #define LE32TOH(x) vrev32.8 x, x |
152 | #endif | | 152 | #endif |
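
    [On big-endian builds both macros expand to vrev32.8, which
    byte-reverses each 32-bit lane; on little-endian they expand to
    nothing. The scalar equivalent is an ordinary 32-bit byte swap
    (a sketch, not kernel code):]

        #include <stdint.h>

        static inline uint32_t
        bswap32_(uint32_t x)    /* vrev32.8, one 32-bit lane at a time */
        {
                return (x >> 24) | ((x >> 8) & 0x0000ff00) |
                    ((x << 8) & 0x00ff0000) | (x << 24);
        }
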
153 | | | 153 | |
154 | .text | | 154 | .text |
155 | .p2align 2 | | 155 | .p2align 2 |
156 | .Lconstants_addr: | | 156 | .Lconstants_addr: |
157 | .long .Lconstants - . | | 157 | .long .Lconstants - . |
158 | | | 158 | |
159 | /* | | 159 | /* |
160 | * chacha_stream256_neon(uint8_t s[256]@r0, | | 160 | * chacha_stream256_neon(uint8_t s[256]@r0, |
161 | * uint32_t blkno@r1, | | 161 | * uint32_t blkno@r1, |
162 | * const uint8_t nonce[12]@r2, | | 162 | * const uint8_t nonce[12]@r2, |
163 | * const uint8_t key[32]@r3, | | 163 | * const uint8_t key[32]@r3, |
164 | * const uint8_t const[16]@sp[0], | | 164 | * const uint8_t const[16]@sp[0], |
165 | * unsigned nr@sp[4]) | | 165 | * unsigned nr@sp[4]) |
166 | */ | | 166 | */ |
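
    [The broadcasts below build four copies of the standard 16-word ChaCha
    input block: words 0-3 hold the constant, 4-11 the key, 12 the block
    counter (lane i gets blkno+i), and 13-15 the nonce. Per block, the
    layout in C (a sketch; le32dec_ is a stand-in little-endian load, not
    the kernel's API):]

        #include <stdint.h>

        static uint32_t
        le32dec_(const uint8_t *p)
        {
                return p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16 |
                    (uint32_t)p[3] << 24;
        }

        static void
        chacha_block_init(uint32_t x[16], uint32_t blkno,
            const uint8_t nonce[12], const uint8_t key[32],
            const uint8_t c[16])
        {
                int i;

                for (i = 0; i < 4; i++)
                        x[i] = le32dec_(c + 4*i);       /* q0-q3: constant */
                for (i = 0; i < 8; i++)
                        x[4 + i] = le32dec_(key + 4*i); /* q4-q11: key */
                x[12] = blkno;                          /* q12: counter */
                for (i = 0; i < 3; i++)
                        x[13 + i] = le32dec_(nonce + 4*i); /* q13-q15 */
        }
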
167 | ENTRY(chacha_stream256_neon) | | 167 | ENTRY(chacha_stream256_neon) |
168 | /* save callee-saves registers */ | | 168 | /* save callee-saves registers */ |
169 | push {r4, r5, r6, r7, r8, r10, fp, lr} | | 169 | push {r4, r5, r6, r7, r8, r10, fp, lr} |
170 | vpush {d8-d15} | | 170 | vpush {d8-d15} |
171 | | | 171 | |
172 | /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ | | 172 | /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ |
173 | ldr r7, .Lconstants_addr | | 173 | ldr r7, .Lconstants_addr |
174 | adr r6, .Lconstants_addr | | 174 | adr r6, .Lconstants_addr |
175 | | | 175 | |
176 | /* reserve space for two 128-bit/16-byte q registers */ | | 176 | /* reserve space for two 128-bit/16-byte q registers */ |
177 | sub fp, sp, #0x20 | | 177 | sub fp, sp, #0x20 |
178 | bic fp, fp, #0x1f /* align */ | | 178 | bic fp, fp, #0x1f /* align */ |
179 | | | 179 | |
180 | /* get parameters */ | | 180 | /* get parameters */ |
181 | add ip, sp, #96 | | 181 | add ip, sp, #96 |
182 | add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ | | 182 | add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ |
183 | ldm ip, {r4, r5} /* r4 := const, r5 := nr */ | | 183 | ldm ip, {r4, r5} /* r4 := const, r5 := nr */ |
184 | ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ | | 184 | ldm r2, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ |
185 | | | 185 | |
186 | vld1.32 {q12}, [r4] /* q12 := constant */ | | 186 | vld1.32 {q12}, [r4] /* q12 := constant */ |
187 | vld1.32 {q13-q14}, [r3] /* q13-q14 := key */ | | 187 | vld1.32 {q13-q14}, [r3] /* q13-q14 := key */ |
188 | vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ | | 188 | vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ |
189 | | | 189 | |
190 | vdup.32 q0, d24[0] /* q0-q3 := constant */ | | 190 | vdup.32 q0, d24[0] /* q0-q3 := constant */ |
191 | vdup.32 q1, d24[1] | | 191 | vdup.32 q1, d24[1] |
192 | vdup.32 q2, d25[0] | | 192 | vdup.32 q2, d25[0] |
193 | vdup.32 q3, d25[1] | | 193 | vdup.32 q3, d25[1] |
194 | vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */ | | 194 | vdup.32 q12, r1 /* q12 := (blkno, blkno, blkno, blkno) */ |
195 | vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ | | 195 | vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ |
196 | vdup.32 q5, d26[1] | | 196 | vdup.32 q5, d26[1] |
197 | vdup.32 q6, d27[0] | | 197 | vdup.32 q6, d27[0] |
198 | vdup.32 q7, d27[1] | | 198 | vdup.32 q7, d27[1] |
199 | vdup.32 q8, d28[0] | | 199 | vdup.32 q8, d28[0] |
200 | vdup.32 q9, d28[1] | | 200 | vdup.32 q9, d28[1] |
201 | vdup.32 q10, d29[0] | | 201 | vdup.32 q10, d29[0] |
202 | vdup.32 q11, d29[1] | | 202 | vdup.32 q11, d29[1] |
203 | vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ | | 203 | vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ |
204 | vdup.32 q13, r6 /* q13-q15 := nonce */ | | 204 | vdup.32 q13, r6 /* q13-q15 := nonce */ |
205 | vdup.32 q14, r8 | | 205 | vdup.32 q14, r8 |
206 | vdup.32 q15, r10 | | 206 | vdup.32 q15, r10 |
207 | | | 207 | |
208 | HTOLE32(q0) | | 208 | HTOLE32(q0) |
209 | HTOLE32(q1) | | 209 | HTOLE32(q1) |
210 | HTOLE32(q2) | | 210 | HTOLE32(q2) |
211 | HTOLE32(q3) | | 211 | HTOLE32(q3) |
212 | HTOLE32(q4) | | 212 | HTOLE32(q4) |
213 | HTOLE32(q5) | | 213 | HTOLE32(q5) |
214 | HTOLE32(q6) | | 214 | HTOLE32(q6) |
215 | HTOLE32(q7) | | 215 | HTOLE32(q7) |
216 | HTOLE32(q8) | | 216 | HTOLE32(q8) |
217 | HTOLE32(q9) | | 217 | HTOLE32(q9) |
218 | HTOLE32(q10) | | 218 | HTOLE32(q10) |
219 | HTOLE32(q11) | | 219 | HTOLE32(q11) |
220 | HTOLE32(q12) | | 220 | HTOLE32(q12) |
221 | HTOLE32(q13) | | 221 | HTOLE32(q13) |
222 | HTOLE32(q14) | | 222 | HTOLE32(q14) |
223 | HTOLE32(q15) | | 223 | HTOLE32(q15) |
224 | | | 224 | |
225 | b 2f | | 225 | b 2f |
226 | | | 226 | |
227 | _ALIGN_TEXT | | 227 | _ALIGN_TEXT |
228 | 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 | | 228 | 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 |
229 | 2: subs r5, r5, #2 | | 229 | 2: subs r5, r5, #2 |
230 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ | | 230 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ |
231 | d16, d24,d25, d26,d27, d28,d29, d30,d31 | | 231 | d16, d24,d25, d26,d27, d28,d29, d30,d31 |
232 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 | | 232 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 |
233 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ | | 233 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ |
234 | d20, d30,d31, d24,d25, d26,d27, d28,d29 | | 234 | d20, d30,d31, d24,d25, d26,d27, d28,d29 |
235 | bne 1b | | 235 | bne 1b |
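
    [Each pass through this loop performs one column round followed by one
    diagonal round, which is why the round count drops by 2 per iteration;
    the rotated register lists in the second ROUND select the diagonals.
    In terms of the quarterround sketch above, one iteration computes:]

        static void
        doubleround(uint32_t x[16])
        {
                /* column round: ROUND q0..q3, q4..q7, q8..q11, q12..q15 */
                quarterround(&x[0], &x[4], &x[8], &x[12]);
                quarterround(&x[1], &x[5], &x[9], &x[13]);
                quarterround(&x[2], &x[6], &x[10], &x[14]);
                quarterround(&x[3], &x[7], &x[11], &x[15]);
                /* diagonal round: b/c/d rows rotated by 1/2/3 lanes */
                quarterround(&x[0], &x[5], &x[10], &x[15]);
                quarterround(&x[1], &x[6], &x[11], &x[12]);
                quarterround(&x[2], &x[7], &x[8], &x[13]);
                quarterround(&x[3], &x[4], &x[9], &x[14]);
        }
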
236 | | | 236 | |
237 | /* | | 237 | /* |
238 | * q8-q9 are free / saved on the stack. We have: | | 238 | * q8-q9 are free / saved on the stack. We have: |
239 | * | | 239 | * |
240 | * q0 = (x0[0], x1[0]; x2[0], x3[0]) | | 240 | * q0 = (x0[0], x1[0]; x2[0], x3[0]) |
241 | * q1 = (x0[1], x1[1]; x2[1], x3[1]) | | 241 | * q1 = (x0[1], x1[1]; x2[1], x3[1]) |
242 | * q2 = (x0[2], x1[2]; x2[2], x3[2]) | | 242 | * q2 = (x0[2], x1[2]; x2[2], x3[2]) |
243 | * q3 = (x0[3], x1[3]; x2[3], x3[3]) | | 243 | * q3 = (x0[3], x1[3]; x2[3], x3[3]) |
244 | * ... | | 244 | * ... |
245 | * q15 = (x0[15], x1[15]; x2[15], x3[15]) | | 245 | * q15 = (x0[15], x1[15]; x2[15], x3[15]) |
246 | * | | 246 | * |
247 | * where xi[j] is the jth word of the ith 16-word block. Zip | | 247 | * where xi[j] is the jth word of the ith 16-word block. Zip |
248 | * consecutive pairs with vzip.32, and you get: | | 248 | * consecutive pairs with vzip.32, and you get: |
249 | * | | 249 | * |
250 | * q0 = (x0[0], x0[1]; x1[0], x1[1]) | | 250 | * q0 = (x0[0], x0[1]; x1[0], x1[1]) |
251 | * q1 = (x2[0], x2[1]; x3[0], x3[1]) | | 251 | * q1 = (x2[0], x2[1]; x3[0], x3[1]) |
252 | * q2 = (x0[2], x0[3]; x1[2], x1[3]) | | 252 | * q2 = (x0[2], x0[3]; x1[2], x1[3]) |
253 | * q3 = (x2[2], x2[3]; x3[2], x3[3]) | | 253 | * q3 = (x2[2], x2[3]; x3[2], x3[3]) |
254 | * ... | | 254 | * ... |
255 | * q15 = (x2[14], x2[15]; x3[14], x3[15]) | | 255 | * q15 = (x2[14], x2[15]; x3[14], x3[15]) |
256 | * | | 256 | * |
257 | * As 64-bit d registers, this is: | | 257 | * As 64-bit d registers, this is: |
258 | * | | 258 | * |
259 | * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) | | 259 | * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) |
260 | * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) | | 260 | * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) |
261 | * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) | | 261 | * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) |
262 | * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) | | 262 | * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) |
263 | * ... | | 263 | * ... |
264 | * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) | | 264 | * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) |
265 | * | | 265 | * |
266 | * Swap d1<->d4, d3<->d6, ..., and you get: | | 266 | * Swap d1<->d4, d3<->d6, ..., and you get: |
267 | * | | 267 | * |
268 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | 268 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) |
269 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | 269 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) |
270 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | 270 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) |
271 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | 271 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) |
272 | * ... | | 272 | * ... |
273 | * q15 = (x15[0], x15[1]; x15[2], x15[3]) | | 273 | * q15 = (x15[0], x15[1]; x15[2], x15[3]) |
274 | */ | | 274 | */ |
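
    [A plain-C model of the zip-and-swap transpose described above, for one
    group of four q registers (the code applies the same pattern to q0-q3,
    q4-q7, and later q8-q15); a sketch to make the index bookkeeping
    concrete, not the kernel's code:]

        #include <stdint.h>
        #include <string.h>

        static void
        zip32(uint32_t a[4], uint32_t b[4])     /* vzip.32 a, b */
        {
                uint32_t t[8] = { a[0], b[0], a[1], b[1],
                                  a[2], b[2], a[3], b[3] };

                memcpy(a, t, 16);
                memcpy(b, t + 4, 16);
        }

        static void
        swapd(uint32_t *p, uint32_t *q)         /* vswp: 64-bit swap */
        {
                uint32_t t[2];

                memcpy(t, p, 8); memcpy(p, q, 8); memcpy(q, t, 8);
        }

        /* On entry q_j[i] = xi[j]; on exit q0 = x0[0:4), q1 = x2[0:4),
           q2 = x1[0:4), q3 = x3[0:4), as in the comment above. */
        static void
        transpose4(uint32_t q0[4], uint32_t q1[4], uint32_t q2[4],
            uint32_t q3[4])
        {
                zip32(q0, q1);
                zip32(q2, q3);
                swapd(q0 + 2, q2);      /* vswp d1, d4 */
                swapd(q1 + 2, q3);      /* vswp d3, d6 */
        }
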
275 | | | 275 | |
276 | sub r7, r7, #0x10 | | 276 | sub r7, r7, #0x10 |
277 | vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */ | | 277 | vdup.32 q8, r1 /* q8 := (blkno, blkno, blkno, blkno) */ |
278 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ | | 278 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ |
279 | | | 279 | |
280 | vzip.32 q0, q1 | | 280 | vzip.32 q0, q1 |
281 | vzip.32 q2, q3 | | 281 | vzip.32 q2, q3 |
282 | vzip.32 q4, q5 | | 282 | vzip.32 q4, q5 |
283 | vzip.32 q6, q7 | | 283 | vzip.32 q6, q7 |
284 | | | 284 | |
285 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ | | 285 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ |
286 | vld1.32 {q9}, [r4] /* q9 := constant */ | | 286 | vld1.32 {q9}, [r4] /* q9 := constant */ |
287 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ | | 287 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ |
288 | vld1.32 {q8}, [r3]! /* q8 := key[0:16) */ | | 288 | vld1.32 {q8}, [r3]! /* q8 := key[0:16) */ |
289 | | | 289 | |
290 | vswp d1, d4 | | 290 | vswp d1, d4 |
291 | vswp d9, d12 | | 291 | vswp d9, d12 |
292 | vswp d3, d6 | | 292 | vswp d3, d6 |
293 | vswp d11, d14 | | 293 | vswp d11, d14 |
294 | | | 294 | |
295 | /* | | 295 | /* |
296 | * At this point, the blocks are: | | 296 | * At this point, the blocks are: |
297 | * | | 297 | * |
298 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | 298 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) |
299 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | 299 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) |
300 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | 300 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) |
301 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | 301 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) |
302 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) | | 302 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) |
303 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) | | 303 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) |
304 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) | | 304 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) |
305 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) | | 305 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) |
306 | * | | 306 | * |
307 | * The first two rows to write out are q0 = x0[0:4) and q4 = | | 307 | * The first two rows to write out are q0 = x0[0:4) and q4 = |
308 | * x0[4:8). If we first swap q1 and q4, then once we've | | 308 | * x0[4:8). Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14 |
309 | * written them out we free up consecutive registers q0-q1 for | | 309 | * enables us to issue all stores in consecutive pairs: |
310 | * store-multiple. | | 310 | * x0 in q0-q1 |
| | | 311 | * x1 in q8-q9 |
| | | 312 | * x2 in q2-q3 |
| | | 313 | * x3 in q10-q11 |
| | | 314 | * x4 in q4-q5 |
| | | 315 | * x5 in q12-q13 |
| | | 316 | * x6 in q6-q7 |
| | | 317 | * x7 in q14-q15 |
311 | */ | | 318 | */ |
312 | | | 319 | |
313 | vswp q1, q4 | | 320 | vswp q1, q4 |
| | | 321 | vswp q3, q6 |
314 | | | 322 | |
315 | vadd.u32 q0, q0, q9 | | 323 | vadd.u32 q0, q0, q9 |
316 | vadd.u32 q4, q4, q9 | | 324 | vadd.u32 q4, q4, q9 |
317 | vadd.u32 q2, q2, q9 | | 325 | vadd.u32 q2, q2, q9 |
318 | vadd.u32 q3, q3, q9 | | 326 | vadd.u32 q6, q6, q9 |
319 | | | 327 | |
320 | vadd.u32 q1, q1, q8 | | 328 | vadd.u32 q1, q1, q8 |
321 | vadd.u32 q5, q5, q8 | | 329 | vadd.u32 q5, q5, q8 |
322 | vadd.u32 q6, q6, q8 | | 330 | vadd.u32 q3, q3, q8 |
323 | vadd.u32 q7, q7, q8 | | 331 | vadd.u32 q7, q7, q8 |
324 | | | 332 | |
325 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ | | 333 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ |
326 | | | 334 | |
327 | LE32TOH(q0) | | 335 | LE32TOH(q0) |
328 | LE32TOH(q1) | | 336 | LE32TOH(q1) |
329 | LE32TOH(q2) | | 337 | LE32TOH(q2) |
330 | LE32TOH(q3) | | 338 | LE32TOH(q3) |
331 | LE32TOH(q4) | | 339 | LE32TOH(q4) |
332 | LE32TOH(q5) | | 340 | LE32TOH(q5) |
333 | LE32TOH(q6) | | 341 | LE32TOH(q6) |
334 | LE32TOH(q7) | | 342 | LE32TOH(q7) |
335 | | | 343 | |
336 | vst1.32 {q0-q1}, [r0]! | | 344 | vst1.32 {q0-q1}, [r0]! |
337 | vld1.32 {q0}, [r3] /* q0 := key[16:32) */ | | 345 | vld1.32 {q0}, [r3] /* q0 := key[16:32) */ |
338 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ | | 346 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ |
339 | vmov d2, r3, r6 | | 347 | vmov d2, r3, r6 |
340 | vmov d3, r8, r10 | | 348 | vmov d3, r8, r10 |
341 | | | 349 | |
342 | vzip.32 q8, q9 | | 350 | vzip.32 q8, q9 |
343 | vzip.32 q10, q11 | | 351 | vzip.32 q10, q11 |
344 | vzip.32 q12, q13 | | 352 | vzip.32 q12, q13 |
345 | vzip.32 q14, q15 | | 353 | vzip.32 q14, q15 |
346 | | | 354 | |
347 | vswp d17, d20 | | 355 | vswp d17, d20 |
348 | vswp d25, d28 | | 356 | vswp d25, d28 |
349 | vswp d19, d22 | | 357 | vswp d19, d22 |
350 | vswp d27, d30 | | 358 | vswp d27, d30 |
351 | | | 359 | |
| | | 360 | vswp q9, q12 |
| | | 361 | vswp q11, q14 |
| | | 362 | |
352 | vadd.u32 q8, q8, q0 | | 363 | vadd.u32 q8, q8, q0 |
353 | vadd.u32 q9, q9, q0 | | 364 | vadd.u32 q12, q12, q0 |
354 | vadd.u32 q10, q10, q0 | | 365 | vadd.u32 q10, q10, q0 |
355 | vadd.u32 q11, q11, q0 | | 366 | vadd.u32 q14, q14, q0 |
356 | | | 367 | |
357 | vadd.u32 q12, q12, q1 | | 368 | vadd.u32 q9, q9, q1 |
358 | vadd.u32 q13, q13, q1 | | 369 | vadd.u32 q13, q13, q1 |
359 | vadd.u32 q14, q14, q1 | | 370 | vadd.u32 q11, q11, q1 |
360 | vadd.u32 q15, q15, q1 | | 371 | vadd.u32 q15, q15, q1 |
361 | | | 372 | |
362 | LE32TOH(q8) | | 373 | LE32TOH(q8) |
363 | LE32TOH(q9) | | 374 | LE32TOH(q9) |
364 | LE32TOH(q10) | | 375 | LE32TOH(q10) |
365 | LE32TOH(q11) | | 376 | LE32TOH(q11) |
366 | LE32TOH(q12) | | 377 | LE32TOH(q12) |
367 | LE32TOH(q13) | | 378 | LE32TOH(q13) |
368 | LE32TOH(q14) | | 379 | LE32TOH(q14) |
369 | LE32TOH(q15) | | 380 | LE32TOH(q15) |
370 | | | 381 | |
371 | /* prepare to zero temporary space on stack */ | | 382 | /* vst1.32 {q0-q1}, [r0]! */ |
372 | vmov.i32 q0, #0 | | 383 | vst1.32 {q8-q9}, [r0]! |
373 | vmov.i32 q1, #0 | | 384 | vst1.32 {q2-q3}, [r0]! |
374 | | | 385 | vst1.32 {q10-q11}, [r0]! |
375 | /* vst1.32 {q0}, [r0]! */ | | 386 | vst1.32 {q4-q5}, [r0]! |
376 | /* vst1.32 {q1}, [r0]! */ /* (was q4 before vswp) */ | | 387 | vst1.32 {q12-q13}, [r0]! |
377 | vst1.32 {q8}, [r0]! | | 388 | vst1.32 {q6-q7}, [r0]! |
378 | vst1.32 {q12}, [r0]! | | 389 | vst1.32 {q14-q15}, [r0] |
379 | vst1.32 {q2}, [r0]! | | | |
380 | vst1.32 {q6}, [r0]! | | | |
381 | vst1.32 {q10}, [r0]! | | | |
382 | vst1.32 {q14}, [r0]! | | | |
383 | vst1.32 {q4}, [r0]! /* (was q1 before vswp) */ | | | |
384 | vst1.32 {q5}, [r0]! | | | |
385 | vst1.32 {q9}, [r0]! | | | |
386 | vst1.32 {q13}, [r0]! | | | |
387 | vst1.32 {q3}, [r0]! | | | |
388 | vst1.32 {q7}, [r0]! | | | |
389 | vst1.32 {q11}, [r0]! | | | |
390 | vst1.32 {q15}, [r0] | | | |
391 | | | 390 | |
392 | /* zero temporary space on the stack */ | | 391 | /* zero temporary space on the stack */ |
| | | 392 | vmov.i32 q0, #0 |
| | | 393 | vmov.i32 q1, #0 |
393 | vst1.8 {q0-q1}, [fp, :256] | | 394 | vst1.8 {q0-q1}, [fp, :256] |
394 | | | 395 | |
395 | /* restore callee-saves registers and stack */ | | 396 | /* restore callee-saves registers and stack */ |
396 | vpop {d8-d15} | | 397 | vpop {d8-d15} |
397 | pop {r4, r5, r6, r7, r8, r10, fp, lr} | | 398 | pop {r4, r5, r6, r7, r8, r10, fp, lr} |
398 | bx lr | | 399 | bx lr |
399 | END(chacha_stream256_neon) | | 400 | END(chacha_stream256_neon) |
400 | | | 401 | |
401 | /* | | 402 | /* |
402 | * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1, | | 403 | * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1, |
403 | * uint32_t blkno@r2, | | 404 | * uint32_t blkno@r2, |
404 | * const uint8_t nonce[12]@r3, | | 405 | * const uint8_t nonce[12]@r3, |
405 | * const uint8_t key[32]@sp[0], | | 406 | * const uint8_t key[32]@sp[0], |
406 | * const uint8_t const[16]@sp[4], | | 407 | * const uint8_t const[16]@sp[4], |
407 | * unsigned nr@sp[8]) | | 408 | * unsigned nr@sp[8]) |
408 | */ | | 409 | */ |
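
    [Functionally this routine is chacha_stream256_neon followed by an xor
    with the plaintext, fused so the keystream never hits memory. The
    contract, as a trivial C model (stream_xor_ is an illustrative name
    only):]

        #include <stddef.h>
        #include <stdint.h>

        static void
        stream_xor_(uint8_t *s, const uint8_t *p, const uint8_t *ks,
            size_t n)
        {
                size_t i;

                for (i = 0; i < n; i++) /* ciphertext = plaintext ^ ks */
                        s[i] = p[i] ^ ks[i];
        }
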
409 | ENTRY(chacha_stream_xor256_neon) | | 410 | ENTRY(chacha_stream_xor256_neon) |
410 | /* save callee-saves registers */ | | 411 | /* save callee-saves registers */ |
411 | push {r4, r5, r6, r7, r8, r10, fp, lr} | | 412 | push {r4, r5, r6, r7, r8, r10, fp, lr} |
412 | vpush {d8-d15} | | 413 | vpush {d8-d15} |
413 | | | 414 | |
414 | /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ | | 415 | /* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */ |
415 | ldr r7, .Lconstants_addr | | 416 | ldr r7, .Lconstants_addr |
416 | adr r6, .Lconstants_addr | | 417 | adr r6, .Lconstants_addr |
417 | | | 418 | |
418 | /* reserve space for two 128-bit/16-byte q registers */ | | 419 | /* reserve space for two 128-bit/16-byte q registers */ |
419 | sub fp, sp, #0x20 | | 420 | sub fp, sp, #0x20 |
420 | bic fp, fp, #0x1f /* align */ | | 421 | bic fp, fp, #0x1f /* align */ |
421 | | | 422 | |
422 | /* get parameters */ | | 423 | /* get parameters */ |
423 | add ip, sp, #96 | | 424 | add ip, sp, #96 |
424 | add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ | | 425 | add r7, r7, r6 /* r7 := .Lconstants (= v0123) */ |
425 | ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */ | | 426 | ldm ip, {r4, r5, ip} /* r4 := key, r5 := const, ip := nr */ |
426 | ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ | | 427 | ldm r3, {r6, r8, r10} /* (r6, r8, r10) := nonce[0:12) */ |
427 | | | 428 | |
428 | vld1.32 {q12}, [r5] /* q12 := constant */ | | 429 | vld1.32 {q12}, [r5] /* q12 := constant */ |
429 | vld1.32 {q13-q14}, [r4] /* q13-q14 := key */ | | 430 | vld1.32 {q13-q14}, [r4] /* q13-q14 := key */ |
430 | vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ | | 431 | vld1.32 {q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */ |
431 | | | 432 | |
432 | vdup.32 q0, d24[0] /* q0-q3 := constant */ | | 433 | vdup.32 q0, d24[0] /* q0-q3 := constant */ |
433 | vdup.32 q1, d24[1] | | 434 | vdup.32 q1, d24[1] |
434 | vdup.32 q2, d25[0] | | 435 | vdup.32 q2, d25[0] |
435 | vdup.32 q3, d25[1] | | 436 | vdup.32 q3, d25[1] |
436 | vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */ | | 437 | vdup.32 q12, r2 /* q12 := (blkno, blkno, blkno, blkno) */ |
437 | vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ | | 438 | vdup.32 q4, d26[0] /* q4-q11 := (key, key, key, key) */ |
438 | vdup.32 q5, d26[1] | | 439 | vdup.32 q5, d26[1] |
439 | vdup.32 q6, d27[0] | | 440 | vdup.32 q6, d27[0] |
440 | vdup.32 q7, d27[1] | | 441 | vdup.32 q7, d27[1] |
441 | vdup.32 q8, d28[0] | | 442 | vdup.32 q8, d28[0] |
442 | vdup.32 q9, d28[1] | | 443 | vdup.32 q9, d28[1] |
443 | vdup.32 q10, d29[0] | | 444 | vdup.32 q10, d29[0] |
444 | vdup.32 q11, d29[1] | | 445 | vdup.32 q11, d29[1] |
445 | vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ | | 446 | vadd.u32 q12, q12, q15 /* q12 := (blkno,blkno+1,blkno+2,blkno+3) */ |
446 | vdup.32 q13, r6 /* q13-q15 := nonce */ | | 447 | vdup.32 q13, r6 /* q13-q15 := nonce */ |
447 | vdup.32 q14, r8 | | 448 | vdup.32 q14, r8 |
448 | vdup.32 q15, r10 | | 449 | vdup.32 q15, r10 |
449 | | | 450 | |
450 | HTOLE32(q0) | | 451 | HTOLE32(q0) |
451 | HTOLE32(q1) | | 452 | HTOLE32(q1) |
452 | HTOLE32(q2) | | 453 | HTOLE32(q2) |
453 | HTOLE32(q3) | | 454 | HTOLE32(q3) |
454 | HTOLE32(q4) | | 455 | HTOLE32(q4) |
455 | HTOLE32(q5) | | 456 | HTOLE32(q5) |
456 | HTOLE32(q6) | | 457 | HTOLE32(q6) |
457 | HTOLE32(q7) | | 458 | HTOLE32(q7) |
458 | HTOLE32(q8) | | 459 | HTOLE32(q8) |
459 | HTOLE32(q9) | | 460 | HTOLE32(q9) |
460 | HTOLE32(q10) | | 461 | HTOLE32(q10) |
461 | HTOLE32(q11) | | 462 | HTOLE32(q11) |
462 | HTOLE32(q12) | | 463 | HTOLE32(q12) |
463 | HTOLE32(q13) | | 464 | HTOLE32(q13) |
464 | HTOLE32(q14) | | 465 | HTOLE32(q14) |
465 | HTOLE32(q15) | | 466 | HTOLE32(q15) |
466 | | | 467 | |
467 | b 2f | | 468 | b 2f |
468 | | | 469 | |
469 | _ALIGN_TEXT | | 470 | _ALIGN_TEXT |
470 | 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 | | 471 | 1: ROUNDLD q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14 |
471 | 2: subs ip, ip, #2 | | 472 | 2: subs ip, ip, #2 |
472 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ | | 473 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ |
473 | d16, d24,d25, d26,d27, d28,d29, d30,d31 | | 474 | d16, d24,d25, d26,d27, d28,d29, d30,d31 |
474 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 | | 475 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 |
475 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ | | 476 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ |
476 | d20, d30,d31, d24,d25, d26,d27, d28,d29 | | 477 | d20, d30,d31, d24,d25, d26,d27, d28,d29 |
477 | bne 1b | | 478 | bne 1b |
478 | | | 479 | |
479 | /* | | 480 | /* |
480 | * q8-q9 are free / saved on the stack. Now for the real fun: | | 481 | * q8-q9 are free / saved on the stack. Now for the real fun: |
481 | * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in | | 482 | * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in |
482 | * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are | | 483 | * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are |
483 | * transposed from one another, and the x[i] are in general | | 484 | * transposed from one another, and the x[i] are in general |
484 | * registers and memory. So we have: | | 485 | * registers and memory. See comments in chacha_stream256_neon |
485 | * | | 486 | * for the layout with swaps. |
486 | * q0 = (x0[0], x1[0]; x2[0], x3[0]) | | | |
487 | * q1 = (x0[1], x1[1]; x2[1], x3[1]) | | | |
488 | * q2 = (x0[2], x1[2]; x2[2], x3[2]) | | | |
489 | * q3 = (x0[3], x1[3]; x2[3], x3[3]) | | | |
490 | * ... | | | |
491 | * q15 = (x0[15], x1[15]; x2[15], x3[15]) | | | |
492 | * | | | |
493 | * where xi[j] is the jth word of the ith 16-word block. Zip | | | |
494 | * consecutive pairs with vzip.32, and you get: | | | |
495 | * | | | |
496 | * q0 = (x0[0], x0[1]; x1[0], x1[1]) | | | |
497 | * q1 = (x2[0], x2[1]; x3[0], x3[1]) | | | |
498 | * q2 = (x0[2], x0[3]; x1[2], x1[3]) | | | |
499 | * q3 = (x2[2], x2[3]; x3[2], x3[3]) | | | |
500 | * ... | | | |
501 | * q15 = (x2[14], x2[15]; x3[14], x3[15]) | | | |
502 | * | | | |
503 | * As 64-bit d registers, this is: | | | |
504 | * | | | |
505 | * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) | | | |
506 | * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) | | | |
507 | * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) | | | |
508 | * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) | | | |
509 | * ... | | | |
510 | * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) | | | |
511 | * | | | |
512 | * Swap d1<->d4, d3<->d6, ..., and you get: | | | |
513 | * | | | |
514 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | | |
515 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | | |
516 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | | |
517 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | | |
518 | * ... | | | |
519 | * q15 = (x15[0], x15[1]; x15[2], x15[3]) | | | |
520 | */ | | 487 | */ |
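
    [Per 32-bit word, the interleaved finalization below computes
    p ^ (y + x): round output y plus initial-state word x, then xor with
    the plaintext word p, all little-endian. As a one-liner in C (names
    illustrative):]

        #include <stdint.h>

        static inline uint32_t
        final_word(uint32_t x, uint32_t y, uint32_t p)
        {
                return p ^ (y + x);     /* keystream word is x + y */
        }
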
521 | | | 488 | |
522 | sub r7, r7, #0x10 | | 489 | sub r7, r7, #0x10 |
523 | vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */ | | 490 | vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */ |
524 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ | | 491 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ |
525 | | | 492 | |
526 | vzip.32 q0, q1 | | 493 | vzip.32 q0, q1 |
527 | vzip.32 q2, q3 | | 494 | vzip.32 q2, q3 |
528 | vzip.32 q4, q5 | | 495 | vzip.32 q4, q5 |
529 | vzip.32 q6, q7 | | 496 | vzip.32 q6, q7 |
530 | | | 497 | |
531 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ | | 498 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ |
532 | vld1.32 {q9}, [r5] /* q9 := constant */ | | 499 | vld1.32 {q9}, [r5] /* q9 := constant */ |
533 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ | | 500 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ |
534 | vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ | | 501 | vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ |
535 | | | 502 | |
536 | vswp d1, d4 | | | |
537 | vswp d9, d12 | | | |
538 | vswp d3, d6 | | 503 | vswp d3, d6 |
| | | 504 | vswp d9, d12 |
| | | 505 | vswp d1, d4 |
539 | vswp d11, d14 | | 506 | vswp d11, d14 |
540 | | | 507 | |
541 | /* | | | |
542 | * At this point, the blocks are: | | | |
543 | * | | | |
544 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | | |
545 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | | |
546 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | | |
547 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | | |
548 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) | | | |
549 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) | | | |
550 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) | | | |
551 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) | | | |
552 | * | | | |
553 | * The first two rows to write out are q0 = x0[0:4) and q4 = | | | |
554 | * x0[4:8). If we first swap q1 and q4, then once we've | | | |
555 | * written them out we free up consecutive registers q0-q1 for | | | |
556 | * store-multiple. | | | |
557 | */ | | | |
558 | | | | |
559 | vswp q1, q4 | | 508 | vswp q1, q4 |
| | | 509 | vswp q3, q6 |
560 | | | 510 | |
561 | vadd.u32 q0, q0, q9 | | 511 | vadd.u32 q0, q0, q9 |
562 | vadd.u32 q4, q4, q9 | | 512 | vadd.u32 q4, q4, q9 |
563 | vadd.u32 q2, q2, q9 | | 513 | vadd.u32 q2, q2, q9 |
564 | vadd.u32 q3, q3, q9 | | 514 | vadd.u32 q6, q6, q9 |
565 | | | 515 | |
566 | vadd.u32 q1, q1, q8 | | 516 | vadd.u32 q1, q1, q8 |
567 | vadd.u32 q5, q5, q8 | | 517 | vadd.u32 q5, q5, q8 |
568 | vadd.u32 q6, q6, q8 | | 518 | vadd.u32 q3, q3, q8 |
569 | vadd.u32 q7, q7, q8 | | 519 | vadd.u32 q7, q7, q8 |
570 | | | 520 | |
571 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ | | 521 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ |
572 | | | 522 | |
573 | LE32TOH(q0) | | 523 | LE32TOH(q0) |
574 | LE32TOH(q1) | | 524 | LE32TOH(q1) |
575 | LE32TOH(q2) | | 525 | LE32TOH(q2) |
576 | LE32TOH(q6) | | 526 | LE32TOH(q6) |
577 | LE32TOH(q4) | | 527 | LE32TOH(q4) |
578 | LE32TOH(q5) | | 528 | LE32TOH(q5) |
579 | LE32TOH(q3) | | 529 | LE32TOH(q3) |
580 | LE32TOH(q7) | | 530 | LE32TOH(q7) |
581 | | | 531 | |
582 | veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ | | 532 | veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ |
583 | veor q1, q1, q9 | | 533 | veor q1, q1, q9 |
584 | | | 534 | |
585 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ | | 535 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ |
586 | | | 536 | |
587 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ | | 537 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ |
588 | vld1.32 {q0}, [r4] /* q0 := key[16:32) */ | | 538 | vld1.32 {q0}, [r4] /* q0 := key[16:32) */ |
589 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ | | 539 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ |
590 | vmov d2, r3, r6 | | 540 | vmov d2, r3, r6 |
591 | vmov d3, r8, r10 | | 541 | vmov d3, r8, r10 |
592 | | | 542 | |
593 | vzip.32 q8, q9 | | 543 | vzip.32 q8, q9 |
594 | vzip.32 q10, q11 | | 544 | vzip.32 q10, q11 |
595 | vzip.32 q12, q13 | | 545 | vzip.32 q12, q13 |
596 | vzip.32 q14, q15 | | 546 | vzip.32 q14, q15 |
597 | | | 547 | |
598 | vswp d17, d20 | | | |
599 | vswp d25, d28 | | | |
600 | vswp d19, d22 | | 548 | vswp d19, d22 |
| | | 549 | vswp d25, d28 |
| | | 550 | vswp d17, d20 |
601 | vswp d27, d30 | | 551 | vswp d27, d30 |
602 | | | 552 | |
603 | vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ | | 553 | vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ |
| | | 554 | vswp q11, q14 |
604 | | | 555 | |
605 | vadd.u32 q8, q8, q0 | | 556 | vadd.u32 q8, q8, q0 |
606 | vadd.u32 q12, q12, q0 | | 557 | vadd.u32 q12, q12, q0 |
607 | vadd.u32 q10, q10, q0 | | 558 | vadd.u32 q10, q10, q0 |
608 | vadd.u32 q11, q11, q0 | | 559 | vadd.u32 q14, q14, q0 |
609 | | | 560 | |
610 | vadd.u32 q9, q9, q1 | | 561 | vadd.u32 q9, q9, q1 |
611 | vadd.u32 q13, q13, q1 | | 562 | vadd.u32 q13, q13, q1 |
612 | vadd.u32 q14, q14, q1 | | 563 | vadd.u32 q11, q11, q1 |
613 | vadd.u32 q15, q15, q1 | | 564 | vadd.u32 q15, q15, q1 |
614 | | | 565 | |
615 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ | | 566 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ |
616 | | | 567 | |
617 | LE32TOH(q8) | | 568 | LE32TOH(q8) |
618 | LE32TOH(q9) | | 569 | LE32TOH(q9) |
619 | LE32TOH(q10) | | 570 | LE32TOH(q10) |
620 | LE32TOH(q14) | | 571 | LE32TOH(q11) |
621 | LE32TOH(q12) | | 572 | LE32TOH(q12) |
622 | LE32TOH(q13) | | 573 | LE32TOH(q13) |
623 | LE32TOH(q11) | | 574 | LE32TOH(q14) |
624 | LE32TOH(q15) | | 575 | LE32TOH(q15) |
625 | | | 576 | |
626 | veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ | | 577 | veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ |
627 | veor q1, q1, q9 | | 578 | veor q1, q1, q9 |
628 | | | 579 | |
629 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ | | 580 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ |
630 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ | | 581 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ |
631 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ | | 582 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ |
632 | | | 583 | |
633 | veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ | | 584 | veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ |
634 | veor q6, q6, q9 | | 585 | veor q3, q3, q9 |
635 | | | 586 | |
636 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ | | 587 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ |
637 | vst1.32 {q2}, [r0]! /* store ciphertext bytes [64:80) */ | | 588 | vst1.32 {q2-q3}, [r0]! /* store ciphertext bytes [64:96) */ |
638 | | | 589 | |
639 | veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ | | 590 | veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ |
640 | veor q14, q14, q1 | | 591 | veor q11, q11, q1 |
641 | | | 592 | |
642 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ | | 593 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ |
643 | vst1.32 {q6}, [r0]! /* store ciphertext bytes [80:96) */ | | 594 | vst1.32 {q10-q11}, [r0]! /* store ciphertext bytes [96:128) */ |
644 | | | 595 | |
645 | veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ | | 596 | veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ |
646 | veor q5, q5, q9 | | 597 | veor q5, q5, q9 |
647 | | | 598 | |
648 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ | | 599 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ |
649 | vst1.32 {q10}, [r0]! /* store ciphertext bytes [96:112) */ | | 600 | vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */ |
650 | | | 601 | |
651 | veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ | | 602 | veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ |
652 | veor q13, q13, q1 | | 603 | veor q13, q13, q1 |
653 | | | 604 | |
654 | vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ | | 605 | vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ |
655 | vst1.32 {q14}, [r0]! /* store ciphertext bytes [112:128) */ | | 606 | vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */ |
656 | | | 607 | |
657 | veor q8, q3, q8 /* compute ciphertext bytes [192:224) */ | | 608 | veor q6, q6, q8 /* compute ciphertext bytes [192:224) */ |
658 | veor q9, q7, q9 | | 609 | veor q7, q7, q9 |
659 | | | 610 | |
660 | vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */ | | 611 | vst1.32 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */ |
661 | vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */ | | | |
662 | | | 612 | |
663 | veor q0, q11, q0 /* compute ciphertext bytes [224:256) */ | | 613 | veor q14, q14, q0 /* compute ciphertext bytes [224:256) */ |
664 | veor q1, q15, q1 | | 614 | veor q15, q15, q1 |
665 | | | 615 | |
666 | vst1.32 {q8-q9}, [r0]! /* store ciphertext bytes [192:224) */ | | 616 | vst1.32 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */ |
667 | vst1.32 {q0-q1}, [r0] /* store ciphertext bytes [224:256) */ | | | |
668 | | | 617 | |
669 | /* zero temporary space on the stack */ | | 618 | /* zero temporary space on the stack */ |
670 | vmov.i32 q0, #0 | | 619 | vmov.i32 q0, #0 |
671 | vmov.i32 q1, #0 | | 620 | vmov.i32 q1, #0 |
672 | vst1.8 {q0-q1}, [fp, :256] | | 621 | vst1.8 {q0-q1}, [fp, :256] |
673 | | | 622 | |
674 | /* restore callee-saves registers and stack */ | | 623 | /* restore callee-saves registers and stack */ |
675 | vpop {d8-d15} | | 624 | vpop {d8-d15} |
676 | pop {r4, r5, r6, r7, r8, r10, fp, lr} | | 625 | pop {r4, r5, r6, r7, r8, r10, fp, lr} |
677 | bx lr | | 626 | bx lr |
678 | END(chacha_stream_xor256_neon) | | 627 | END(chacha_stream_xor256_neon) |
679 | | | 628 | |
680 | .section .rodata | | 629 | .section .rodata |
681 | .p2align 4 | | 630 | .p2align 4 |
682 | .Lconstants: | | 631 | .Lconstants: |
683 | | | 632 | |
684 | .type v0123,%object | | 633 | .type v0123,%object |
685 | v0123: | | 634 | v0123: |
686 | .long 0, 1, 2, 3 | | 635 | .long 0, 1, 2, 3 |
687 | END(v0123) | | 636 | END(v0123) |
688 | | | 637 | |
689 | .type rot8,%object | | 638 | .type rot8,%object |
690 | rot8: | | 639 | rot8: |
691 | .long 0x02010003, 0x06050407 | | 640 | .long 0x02010003, 0x06050407 |
692 | END(rot8) | | 641 | END(rot8) |