Sun Jul 19 07:32:43 2020 UTC
fix build with clang/llvm.

clang's AArch64 assembler doesn't accept the optional number of lanes in a vector register element specifier.
(But the Arm ARM says that an assembler must accept it.)


(ryo)
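
For reference, the operands touched by this change are element-indexed vector operands written with an explicit lane count (e.g. v5.4s[3]). Both spellings below name the same element; the GNU assembler (with which this file previously built) accepts either, whereas clang's integrated assembler accepts only the form without the lane count. Illustration only, not part of the commit:

	dup	v1.4s, v5.4s[3]		/* lane count in element specifier: rejected by clang */
	dup	v1.4s, v5.s[3]		/* lane count omitted: accepted by both assemblers */
	mov	v2.2d[1], v5.2d[0]	/* likewise rejected by clang */
	mov	v2.d[1], v5.d[0]	/* accepted by both assemblers */
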
diff -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

cvs diff -r1.4 -r1.5 src/sys/crypto/aes/arch/arm/aes_armv8_64.S

--- src/sys/crypto/aes/arch/arm/aes_armv8_64.S 2020/06/30 23:06:02 1.4
+++ src/sys/crypto/aes/arch/arm/aes_armv8_64.S 2020/07/19 07:32:43 1.5
@@ -1,1005 +1,1005 @@
1/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */ 1/* $NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <aarch64/asm.h> 29#include <aarch64/asm.h>
30 30
31 .arch_extension aes 31 .arch_extension aes
32 32
33/* 33/*
34 * uint32_t rcon[10] 34 * uint32_t rcon[10]
35 * 35 *
36 * Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2). 36 * Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
37 * Such elements of GF(2^8) need only eight bits to be represented, 37 * Such elements of GF(2^8) need only eight bits to be represented,
38 * but we store them in 4-byte units so we can copy one into all 38 * but we store them in 4-byte units so we can copy one into all
39 * four 4-byte lanes of a vector register with a single LD1R. The 39 * four 4-byte lanes of a vector register with a single LD1R. The
40 * access pattern is fixed, so indices into this table are never 40 * access pattern is fixed, so indices into this table are never
41 * secret. 41 * secret.
42 */ 42 */
43 .section .rodata 43 .section .rodata
44 .p2align 2 44 .p2align 2
45 .type rcon,@object 45 .type rcon,@object
46rcon: 46rcon:
47 .long 0x01 47 .long 0x01
48 .long 0x02 48 .long 0x02
49 .long 0x04 49 .long 0x04
50 .long 0x08 50 .long 0x08
51 .long 0x10 51 .long 0x10
52 .long 0x20 52 .long 0x20
53 .long 0x40 53 .long 0x40
54 .long 0x80 54 .long 0x80
55 .long 0x1b 55 .long 0x1b
56 .long 0x36 56 .long 0x36
57END(rcon) 57END(rcon)
58 58
59/* 59/*
60 * uint128_t unshiftrows_rotword_1 60 * uint128_t unshiftrows_rotword_1
61 * 61 *
62 * Table for TBL instruction to undo ShiftRows, and then do 62 * Table for TBL instruction to undo ShiftRows, and then do
63 * RotWord on word 1, and then copy it into all the other words. 63 * RotWord on word 1, and then copy it into all the other words.
64 */ 64 */
65 .section .rodata 65 .section .rodata
66 .p2align 4 66 .p2align 4
67 .type unshiftrows_rotword_1,@object 67 .type unshiftrows_rotword_1,@object
68unshiftrows_rotword_1: 68unshiftrows_rotword_1:
69 .byte 0x01,0x0e,0x0b,0x04 69 .byte 0x01,0x0e,0x0b,0x04
70 .byte 0x01,0x0e,0x0b,0x04 70 .byte 0x01,0x0e,0x0b,0x04
71 .byte 0x01,0x0e,0x0b,0x04 71 .byte 0x01,0x0e,0x0b,0x04
72 .byte 0x01,0x0e,0x0b,0x04 72 .byte 0x01,0x0e,0x0b,0x04
73END(unshiftrows_rotword_1) 73END(unshiftrows_rotword_1)
74 74
75/* 75/*
76 * uint128_t unshiftrows_3 76 * uint128_t unshiftrows_3
77 * 77 *
78 * Table for TBL instruction to undo ShiftRows, and then copy word 78 * Table for TBL instruction to undo ShiftRows, and then copy word
79 * 3 into all the other words. 79 * 3 into all the other words.
80 */ 80 */
81 .section .rodata 81 .section .rodata
82 .p2align 4 82 .p2align 4
83 .type unshiftrows_3,@object 83 .type unshiftrows_3,@object
84unshiftrows_3: 84unshiftrows_3:
85 .byte 0x0c,0x09,0x06,0x03 85 .byte 0x0c,0x09,0x06,0x03
86 .byte 0x0c,0x09,0x06,0x03 86 .byte 0x0c,0x09,0x06,0x03
87 .byte 0x0c,0x09,0x06,0x03 87 .byte 0x0c,0x09,0x06,0x03
88 .byte 0x0c,0x09,0x06,0x03 88 .byte 0x0c,0x09,0x06,0x03
89END(unshiftrows_3) 89END(unshiftrows_3)
90 90
91/* 91/*
92 * uint128_t unshiftrows_rotword_3 92 * uint128_t unshiftrows_rotword_3
93 * 93 *
94 * Table for TBL instruction to undo ShiftRows, and then do 94 * Table for TBL instruction to undo ShiftRows, and then do
95 * RotWord on word 3, and then copy it into all the other words. 95 * RotWord on word 3, and then copy it into all the other words.
96 */ 96 */
97 .section .rodata 97 .section .rodata
98 .p2align 4 98 .p2align 4
99 .type unshiftrows_rotword_3,@object 99 .type unshiftrows_rotword_3,@object
100unshiftrows_rotword_3: 100unshiftrows_rotword_3:
101 .byte 0x09,0x06,0x03,0x0c 101 .byte 0x09,0x06,0x03,0x0c
102 .byte 0x09,0x06,0x03,0x0c 102 .byte 0x09,0x06,0x03,0x0c
103 .byte 0x09,0x06,0x03,0x0c 103 .byte 0x09,0x06,0x03,0x0c
104 .byte 0x09,0x06,0x03,0x0c 104 .byte 0x09,0x06,0x03,0x0c
105END(unshiftrows_rotword_3) 105END(unshiftrows_rotword_3)
106 106
107/* 107/*
108 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1) 108 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
109 * 109 *
110 * Expand a 16-byte AES-128 key into 10 round keys. 110 * Expand a 16-byte AES-128 key into 10 round keys.
111 * 111 *
112 * Standard ABI calling convention. 112 * Standard ABI calling convention.
113 */ 113 */
114ENTRY(aesarmv8_setenckey128) 114ENTRY(aesarmv8_setenckey128)
115 ldr q1, [x1] /* q1 := master key */ 115 ldr q1, [x1] /* q1 := master key */
116 116
117 adrl x4, unshiftrows_rotword_3 117 adrl x4, unshiftrows_rotword_3
118 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ 118 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
119 ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */ 119 ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */
120 120
121 str q1, [x0], #0x10 /* store master key as first round key */ 121 str q1, [x0], #0x10 /* store master key as first round key */
122 mov x2, #10 /* round count */ 122 mov x2, #10 /* round count */
123 adrl x3, rcon /* round constant */ 123 adrl x3, rcon /* round constant */
124 124
1251: /* 1251: /*
126 * q0 = 0 126 * q0 = 0
127 * v1.4s = (prk[0], prk[1], prk[2], prk[3]) 127 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
128 * x0 = pointer to round key to compute 128 * x0 = pointer to round key to compute
129 * x2 = round count 129 * x2 = round count
130 * x3 = rcon pointer 130 * x3 = rcon pointer
131 */ 131 */
132 132
133 /* q3 := ShiftRows(SubBytes(q1)) */ 133 /* q3 := ShiftRows(SubBytes(q1)) */
134 mov v3.16b, v1.16b 134 mov v3.16b, v1.16b
135 aese v3.16b, v0.16b 135 aese v3.16b, v0.16b
136 136
137 /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ 137 /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
138 ld1r {v4.4s}, [x3], #4 138 ld1r {v4.4s}, [x3], #4
139 tbl v3.16b, {v3.16b}, v16.16b 139 tbl v3.16b, {v3.16b}, v16.16b
140 eor v3.16b, v3.16b, v4.16b 140 eor v3.16b, v3.16b, v4.16b
141 141
142 /* 142 /*
143 * v5.4s := (0,prk[0],prk[1],prk[2]) 143 * v5.4s := (0,prk[0],prk[1],prk[2])
144 * v6.4s := (0,0,prk[0],prk[1]) 144 * v6.4s := (0,0,prk[0],prk[1])
145 * v7.4s := (0,0,0,prk[0]) 145 * v7.4s := (0,0,0,prk[0])
146 */ 146 */
147 ext v5.16b, v0.16b, v1.16b, #12 147 ext v5.16b, v0.16b, v1.16b, #12
148 ext v6.16b, v0.16b, v1.16b, #8 148 ext v6.16b, v0.16b, v1.16b, #8
149 ext v7.16b, v0.16b, v1.16b, #4 149 ext v7.16b, v0.16b, v1.16b, #4
150 150
151 /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ 151 /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
152 eor v1.16b, v1.16b, v3.16b 152 eor v1.16b, v1.16b, v3.16b
153 eor v1.16b, v1.16b, v5.16b 153 eor v1.16b, v1.16b, v5.16b
154 eor v1.16b, v1.16b, v6.16b 154 eor v1.16b, v1.16b, v6.16b
155 eor v1.16b, v1.16b, v7.16b 155 eor v1.16b, v1.16b, v7.16b
156 156
157 subs x2, x2, #1 /* count down rounds */ 157 subs x2, x2, #1 /* count down rounds */
158 str q1, [x0], #0x10 /* store round key */ 158 str q1, [x0], #0x10 /* store round key */
159 b.ne 1b 159 b.ne 1b
160 160
161 ret 161 ret
162END(aesarmv8_setenckey128) 162END(aesarmv8_setenckey128)
163 163
164/* 164/*
165 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1) 165 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
166 * 166 *
167 * Expand a 24-byte AES-192 key into 12 round keys. 167 * Expand a 24-byte AES-192 key into 12 round keys.
168 * 168 *
169 * Standard ABI calling convention. 169 * Standard ABI calling convention.
170 */ 170 */
171ENTRY(aesarmv8_setenckey192) 171ENTRY(aesarmv8_setenckey192)
172 ldr q1, [x1], #0x10 /* q1 := master key[0:128) */ 172 ldr q1, [x1], #0x10 /* q1 := master key[0:128) */
173 ldr d2, [x1] /* d2 := master key[128:192) */ 173 ldr d2, [x1] /* d2 := master key[128:192) */
174 174
175 adrl x4, unshiftrows_rotword_1 175 adrl x4, unshiftrows_rotword_1
176 adrl x5, unshiftrows_rotword_3 176 adrl x5, unshiftrows_rotword_3
177 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ 177 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
178 ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */ 178 ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */
179 ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */ 179 ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */
180 180
181 str q1, [x0], #0x10 /* store master key[0:128) as round key */ 181 str q1, [x0], #0x10 /* store master key[0:128) as round key */
182 mov x2, #12 /* round count */ 182 mov x2, #12 /* round count */
183 adrl x3, rcon /* round constant */ 183 adrl x3, rcon /* round constant */
184 184
1851: /* 1851: /*
186 * q0 = 0 186 * q0 = 0
187 * v1.4s = (prk[0], prk[1], prk[2], prk[3]) 187 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
188 * v2.4s = (rklo[0], rklo[1], xxx, xxx) 188 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
189 * x0 = pointer to three round keys to compute 189 * x0 = pointer to three round keys to compute
190 * x2 = round count 190 * x2 = round count
191 * x3 = rcon pointer 191 * x3 = rcon pointer
192 */ 192 */
193 193
194 /* q3 := ShiftRows(SubBytes(q2)) */ 194 /* q3 := ShiftRows(SubBytes(q2)) */
195 mov v3.16b, v2.16b 195 mov v3.16b, v2.16b
196 aese v3.16b, v0.16b 196 aese v3.16b, v0.16b
197 197
198 /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */ 198 /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
199 ld1r {v4.4s}, [x3], #4 199 ld1r {v4.4s}, [x3], #4
200 tbl v3.16b, {v3.16b}, v16.16b 200 tbl v3.16b, {v3.16b}, v16.16b
201 eor v3.16b, v3.16b, v4.16b 201 eor v3.16b, v3.16b, v4.16b
202 202
203 /* 203 /*
204 * We need to compute: 204 * We need to compute:
205 * 205 *
206 * rk[0] := rklo[0] 206 * rk[0] := rklo[0]
207 * rk[1] := rklo[1] 207 * rk[1] := rklo[1]
208 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] 208 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
209 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] 209 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
210 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2] 210 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
211 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] 211 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
212 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] 212 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
213 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] 213 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
214 * ^ rklo[1] 214 * ^ rklo[1]
215 */ 215 */
216 216
217 /* 217 /*
218 * v5.4s := (0,prk[0],prk[1],prk[2]) 218 * v5.4s := (0,prk[0],prk[1],prk[2])
219 * v6.4s := (0,0,prk[0],prk[1]) 219 * v6.4s := (0,0,prk[0],prk[1])
220 * v7.4s := (0,0,0,prk[0]) 220 * v7.4s := (0,0,0,prk[0])
221 */ 221 */
222 ext v5.16b, v0.16b, v1.16b, #12 222 ext v5.16b, v0.16b, v1.16b, #12
223 ext v6.16b, v0.16b, v1.16b, #8 223 ext v6.16b, v0.16b, v1.16b, #8
224 ext v7.16b, v0.16b, v1.16b, #4 224 ext v7.16b, v0.16b, v1.16b, #4
225 225
226 /* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */ 226 /* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
227 eor v5.16b, v5.16b, v1.16b 227 eor v5.16b, v5.16b, v1.16b
228 eor v5.16b, v5.16b, v3.16b 228 eor v5.16b, v5.16b, v3.16b
229 eor v5.16b, v5.16b, v6.16b 229 eor v5.16b, v5.16b, v6.16b
230 eor v5.16b, v5.16b, v7.16b 230 eor v5.16b, v5.16b, v7.16b
231 231
232 /* 232 /*
233 * At this point, rk is split across v2.4s = (rk[0],rk[1],...) 233 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
234 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s = 234 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
235 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or 235 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
236 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s = 236 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
237 * (rklo[0],rklo[1],...). 237 * (rklo[0],rklo[1],...).
238 */ 238 */
239 239
240 /* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */ 240 /* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
241 dup v1.4s, v5.4s[3] 241 dup v1.4s, v5.s[3]
242 mov v1.4s[0], v5.4s[2] 242 mov v1.s[0], v5.s[2]
243 243
244 /* 244 /*
245 * v6.4s := (0, 0, rklo[0], rklo[1]) 245 * v6.4s := (0, 0, rklo[0], rklo[1])
246 * v7.4s := (0, 0, 0, rklo[0]) 246 * v7.4s := (0, 0, 0, rklo[0])
247 */ 247 */
248 ext v6.16b, v0.16b, v2.16b, #8 248 ext v6.16b, v0.16b, v2.16b, #8
249 ext v7.16b, v0.16b, v2.16b, #4 249 ext v7.16b, v0.16b, v2.16b, #4
250 250
251 /* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ 251 /* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
252 eor v3.16b, v1.16b, v6.16b 252 eor v3.16b, v1.16b, v6.16b
253 eor v3.16b, v3.16b, v7.16b 253 eor v3.16b, v3.16b, v7.16b
254 254
255 /* 255 /*
256 * Recall v2.4s = (rk[0], rk[1], xxx, xxx) 256 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
257 * and v5.4s = (rk[2], rk[3], xxx, xxx). Set 257 * and v5.4s = (rk[2], rk[3], xxx, xxx). Set
258 * v2.4s := (rk[0], rk[1], rk[2], rk[3]) 258 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
259 */ 259 */
260 mov v2.2d[1], v5.2d[0] 260 mov v2.d[1], v5.d[0]
261 261
262 /* store two round keys */ 262 /* store two round keys */
263 stp q2, q3, [x0], #0x20 263 stp q2, q3, [x0], #0x20
264 264
265 /* 265 /*
266 * Live vector registers at this point: 266 * Live vector registers at this point:
267 * 267 *
268 * q0 = zero 268 * q0 = zero
269 * q2 = rk 269 * q2 = rk
270 * q3 = nrk 270 * q3 = nrk
271 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1]) 271 * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
272 * q16 = unshiftrows_rotword_1 272 * q16 = unshiftrows_rotword_1
273 * q17 = unshiftrows_rotword_3 273 * q17 = unshiftrows_rotword_3
274 * 274 *
275 * We have to compute, in q1: 275 * We have to compute, in q1:
276 * 276 *
277 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] 277 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
278 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] 278 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
279 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] 279 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
280 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] 280 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
281 * ^ nrk[1] 281 * ^ nrk[1]
282 * 282 *
283 * And, if there's any more afterward, in q2: 283 * And, if there's any more afterward, in q2:
284 * 284 *
285 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] 285 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
286 * ^ nrk[1] ^ nrk[2] 286 * ^ nrk[1] ^ nrk[2]
287 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] 287 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
288 * ^ nrk[1] ^ nrk[2] ^ nrk[3] 288 * ^ nrk[1] ^ nrk[2] ^ nrk[3]
289 */ 289 */
290 290
291 /* q1 := RotWords(SubBytes(q3)) */ 291 /* q1 := RotWords(SubBytes(q3)) */
292 mov v1.16b, v3.16b 292 mov v1.16b, v3.16b
293 aese v1.16b, v0.16b 293 aese v1.16b, v0.16b
294 294
295 /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */ 295 /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
296 ld1r {v4.4s}, [x3], #4 296 ld1r {v4.4s}, [x3], #4
297 tbl v1.16b, {v1.16b}, v17.16b 297 tbl v1.16b, {v1.16b}, v17.16b
298 eor v1.16b, v1.16b, v4.16b 298 eor v1.16b, v1.16b, v4.16b
299 299
300 /* 300 /*
301 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already] 301 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
302 * v4.4s := (0, rk[2], rk[3], nrk[0]) 302 * v4.4s := (0, rk[2], rk[3], nrk[0])
303 * v6.4s := (0, 0, rk[2], rk[3]) 303 * v6.4s := (0, 0, rk[2], rk[3])
304 * v7.4s := (0, 0, 0, rk[2]) 304 * v7.4s := (0, 0, 0, rk[2])
305 */ 305 */
306 ext v4.16b, v0.16b, v5.16b, #12 306 ext v4.16b, v0.16b, v5.16b, #12
307 ext v6.16b, v0.16b, v5.16b, #8 307 ext v6.16b, v0.16b, v5.16b, #8
308 ext v7.16b, v0.16b, v5.16b, #4 308 ext v7.16b, v0.16b, v5.16b, #4
309 309
310 /* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */ 310 /* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
311 eor v1.16b, v1.16b, v5.16b 311 eor v1.16b, v1.16b, v5.16b
312 eor v1.16b, v1.16b, v4.16b 312 eor v1.16b, v1.16b, v4.16b
313 eor v1.16b, v1.16b, v6.16b 313 eor v1.16b, v1.16b, v6.16b
314 eor v1.16b, v1.16b, v7.16b 314 eor v1.16b, v1.16b, v7.16b
315 315
316 subs x2, x2, #3 /* count down three rounds */ 316 subs x2, x2, #3 /* count down three rounds */
317 str q1, [x0], #0x10 /* store third round key */ 317 str q1, [x0], #0x10 /* store third round key */
318 b.eq 2f 318 b.eq 2f
319 319
320 /* 320 /*
321 * v4.4s := (nrk[2], nrk[3], xxx, xxx) 321 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
322 * v5.4s := (0, nrk[2], xxx, xxx) 322 * v5.4s := (0, nrk[2], xxx, xxx)
323 */ 323 */
324 ext v4.16b, v3.16b, v0.16b, #8 324 ext v4.16b, v3.16b, v0.16b, #8
325 ext v5.16b, v0.16b, v4.16b, #12 325 ext v5.16b, v0.16b, v4.16b, #12
326 326
327 /* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */ 327 /* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
328 dup v2.4s, v1.4s[3] 328 dup v2.4s, v1.s[3]
329 329
330 /* 330 /*
331 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2], 331 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
332 * nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3], 332 * nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
333 * xxx, xxx) 333 * xxx, xxx)
334 */ 334 */
335 eor v2.16b, v2.16b, v4.16b 335 eor v2.16b, v2.16b, v4.16b
336 eor v2.16b, v2.16b, v5.16b 336 eor v2.16b, v2.16b, v5.16b
337 337
338 b 1b 338 b 1b
339 339
3402: ret 3402: ret
341END(aesarmv8_setenckey192) 341END(aesarmv8_setenckey192)
342 342
343/* 343/*
344 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1) 344 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
345 * 345 *
346 * Expand a 32-byte AES-256 key into 14 round keys. 346 * Expand a 32-byte AES-256 key into 14 round keys.
347 * 347 *
348 * Standard ABI calling convention. 348 * Standard ABI calling convention.
349 */ 349 */
350ENTRY(aesarmv8_setenckey256) 350ENTRY(aesarmv8_setenckey256)
351 /* q1 := key[0:128), q2 := key[128:256) */ 351 /* q1 := key[0:128), q2 := key[128:256) */
352 ldp q1, q2, [x1], #0x20 352 ldp q1, q2, [x1], #0x20
353 353
354 adrl x4, unshiftrows_rotword_3 354 adrl x4, unshiftrows_rotword_3
355 adrl x5, unshiftrows_3 355 adrl x5, unshiftrows_3
356 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ 356 eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
357 ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */ 357 ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */
358 ldr q17, [x5] /* q17 := unshiftrows_3 */ 358 ldr q17, [x5] /* q17 := unshiftrows_3 */
359 359
360 /* store master key as first two round keys */ 360 /* store master key as first two round keys */
361 stp q1, q2, [x0], #0x20 361 stp q1, q2, [x0], #0x20
362 mov x2, #14 /* round count */ 362 mov x2, #14 /* round count */
363 adrl x3, rcon /* round constant */ 363 adrl x3, rcon /* round constant */
364 364
3651: /* 3651: /*
366 * q0 = 0 366 * q0 = 0
367 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3]) 367 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
368 * v2.4s = (prk[0], prk[1], prk[2], prk[3]) 368 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
369 * x2 = round count 369 * x2 = round count
370 * x3 = rcon pointer 370 * x3 = rcon pointer
371 */ 371 */
372 372
373 /* q3 := ShiftRows(SubBytes(q2)) */ 373 /* q3 := ShiftRows(SubBytes(q2)) */
374 mov v3.16b, v2.16b 374 mov v3.16b, v2.16b
375 aese v3.16b, v0.16b 375 aese v3.16b, v0.16b
376 376
377 /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ 377 /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
378 ld1r {v4.4s}, [x3], #4 378 ld1r {v4.4s}, [x3], #4
379 tbl v3.16b, {v3.16b}, v16.16b 379 tbl v3.16b, {v3.16b}, v16.16b
380 eor v3.16b, v3.16b, v4.16b 380 eor v3.16b, v3.16b, v4.16b
381 381
382 /* 382 /*
383 * v5.4s := (0,pprk[0],pprk[1],pprk[2]) 383 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
384 * v6.4s := (0,0,pprk[0],pprk[1]) 384 * v6.4s := (0,0,pprk[0],pprk[1])
385 * v7.4s := (0,0,0,pprk[0]) 385 * v7.4s := (0,0,0,pprk[0])
386 */ 386 */
387 ext v5.16b, v0.16b, v1.16b, #12 387 ext v5.16b, v0.16b, v1.16b, #12
388 ext v6.16b, v0.16b, v1.16b, #8 388 ext v6.16b, v0.16b, v1.16b, #8
389 ext v7.16b, v0.16b, v1.16b, #4 389 ext v7.16b, v0.16b, v1.16b, #4
390 390
391 /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ 391 /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
392 eor v1.16b, v1.16b, v3.16b 392 eor v1.16b, v1.16b, v3.16b
393 eor v1.16b, v1.16b, v5.16b 393 eor v1.16b, v1.16b, v5.16b
394 eor v1.16b, v1.16b, v6.16b 394 eor v1.16b, v1.16b, v6.16b
395 eor v1.16b, v1.16b, v7.16b 395 eor v1.16b, v1.16b, v7.16b
396 396
397 subs x2, x2, #2 /* count down two rounds */ 397 subs x2, x2, #2 /* count down two rounds */
398 b.eq 2f /* stop if this is the last one */ 398 b.eq 2f /* stop if this is the last one */
399 399
400 /* q3 := ShiftRows(SubBytes(q1)) */ 400 /* q3 := ShiftRows(SubBytes(q1)) */
401 mov v3.16b, v1.16b 401 mov v3.16b, v1.16b
402 aese v3.16b, v0.16b 402 aese v3.16b, v0.16b
403 403
404 /* v3.4s[i] := SubBytes(rk[3]) */ 404 /* v3.4s[i] := SubBytes(rk[3]) */
405 tbl v3.16b, {v3.16b}, v17.16b 405 tbl v3.16b, {v3.16b}, v17.16b
406 406
407 /* 407 /*
408 * v5.4s := (0,prk[0],prk[1],prk[2]) 408 * v5.4s := (0,prk[0],prk[1],prk[2])
409 * v6.4s := (0,0,prk[0],prk[1]) 409 * v6.4s := (0,0,prk[0],prk[1])
410 * v7.4s := (0,0,0,prk[0]) 410 * v7.4s := (0,0,0,prk[0])
411 */ 411 */
412 ext v5.16b, v0.16b, v2.16b, #12 412 ext v5.16b, v0.16b, v2.16b, #12
413 ext v6.16b, v0.16b, v2.16b, #8 413 ext v6.16b, v0.16b, v2.16b, #8
414 ext v7.16b, v0.16b, v2.16b, #4 414 ext v7.16b, v0.16b, v2.16b, #4
415 415
416 /* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ 416 /* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
417 eor v2.16b, v2.16b, v3.16b 417 eor v2.16b, v2.16b, v3.16b
418 eor v2.16b, v2.16b, v5.16b 418 eor v2.16b, v2.16b, v5.16b
419 eor v2.16b, v2.16b, v6.16b 419 eor v2.16b, v2.16b, v6.16b
420 eor v2.16b, v2.16b, v7.16b 420 eor v2.16b, v2.16b, v7.16b
421 421
422 stp q1, q2, [x0], #0x20 /* store two round keys */ 422 stp q1, q2, [x0], #0x20 /* store two round keys */
423 b 1b 423 b 1b
424 424
4252: str q1, [x0] /* store last round key */ 4252: str q1, [x0] /* store last round key */
426 ret 426 ret
427END(aesarmv8_setenckey256) 427END(aesarmv8_setenckey256)
428 428
429/* 429/*
430 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1, 430 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
431 * uint32_t nrounds@x2) 431 * uint32_t nrounds@x2)
432 * 432 *
433 * Convert AES encryption round keys to AES decryption round keys. 433 * Convert AES encryption round keys to AES decryption round keys.
434 * `nrounds' must be between 10 and 14. 434 * `nrounds' must be between 10 and 14.
435 * 435 *
436 * Standard ABI calling convention. 436 * Standard ABI calling convention.
437 */ 437 */
438ENTRY(aesarmv8_enctodec) 438ENTRY(aesarmv8_enctodec)
439 ldr q0, [x0, x2, lsl #4] /* load last round key */ 439 ldr q0, [x0, x2, lsl #4] /* load last round key */
4401: str q0, [x1], #0x10 /* store round key */ 4401: str q0, [x1], #0x10 /* store round key */
441 subs x2, x2, #1 /* count down round */ 441 subs x2, x2, #1 /* count down round */
442 ldr q0, [x0, x2, lsl #4] /* load previous round key */ 442 ldr q0, [x0, x2, lsl #4] /* load previous round key */
443 b.eq 2f /* stop if this is the last one */ 443 b.eq 2f /* stop if this is the last one */
444 aesimc v0.16b, v0.16b /* convert encryption to decryption */ 444 aesimc v0.16b, v0.16b /* convert encryption to decryption */
445 b 1b 445 b 1b
4462: str q0, [x1] /* store first round key verbatim */ 4462: str q0, [x1] /* store first round key verbatim */
447 ret 447 ret
448END(aesarmv8_enctodec) 448END(aesarmv8_enctodec)
449 449
450/* 450/*
451 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1, 451 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
452 * uint8_t out[16] @x2, uint32_t nrounds@x3) 452 * uint8_t out[16] @x2, uint32_t nrounds@x3)
453 * 453 *
454 * Encrypt a single block. 454 * Encrypt a single block.
455 * 455 *
456 * Standard ABI calling convention. 456 * Standard ABI calling convention.
457 */ 457 */
458ENTRY(aesarmv8_enc) 458ENTRY(aesarmv8_enc)
459 stp fp, lr, [sp, #-16]! /* push stack frame */ 459 stp fp, lr, [sp, #-16]! /* push stack frame */
460 mov fp, sp 460 mov fp, sp
461 ldr q0, [x1] /* q0 := ptxt */ 461 ldr q0, [x1] /* q0 := ptxt */
462 bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ 462 bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
463 str q0, [x2] /* store ctxt */ 463 str q0, [x2] /* store ctxt */
464 ldp fp, lr, [sp], #16 /* pop stack frame */ 464 ldp fp, lr, [sp], #16 /* pop stack frame */
465 ret 465 ret
466END(aesarmv8_enc) 466END(aesarmv8_enc)
467 467
468/* 468/*
469 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1, 469 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
470 * uint8_t out[16] @x2, uint32_t nrounds@x3) 470 * uint8_t out[16] @x2, uint32_t nrounds@x3)
471 * 471 *
472 * Decrypt a single block. 472 * Decrypt a single block.
473 * 473 *
474 * Standard ABI calling convention. 474 * Standard ABI calling convention.
475 */ 475 */
476ENTRY(aesarmv8_dec) 476ENTRY(aesarmv8_dec)
477 stp fp, lr, [sp, #-16]! /* push stack frame */ 477 stp fp, lr, [sp, #-16]! /* push stack frame */
478 mov fp, sp 478 mov fp, sp
479 ldr q0, [x1] /* q0 := ctxt */ 479 ldr q0, [x1] /* q0 := ctxt */
480 bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */ 480 bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */
481 str q0, [x2] /* store ptxt */ 481 str q0, [x2] /* store ptxt */
482 ldp fp, lr, [sp], #16 /* pop stack frame */ 482 ldp fp, lr, [sp], #16 /* pop stack frame */
483 ret 483 ret
484END(aesarmv8_dec) 484END(aesarmv8_dec)
485 485
486/* 486/*
487 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1, 487 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
488 * uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4, 488 * uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
489 * uint32_t nrounds@x5) 489 * uint32_t nrounds@x5)
490 * 490 *
491 * Encrypt a contiguous sequence of blocks with AES-CBC. 491 * Encrypt a contiguous sequence of blocks with AES-CBC.
492 * 492 *
493 * nbytes must be an integral multiple of 16. 493 * nbytes must be an integral multiple of 16.
494 * 494 *
495 * Standard ABI calling convention. 495 * Standard ABI calling convention.
496 */ 496 */
497ENTRY(aesarmv8_cbc_enc) 497ENTRY(aesarmv8_cbc_enc)
498 cbz x3, 2f /* stop if nothing to do */ 498 cbz x3, 2f /* stop if nothing to do */
499 stp fp, lr, [sp, #-16]! /* push stack frame */ 499 stp fp, lr, [sp, #-16]! /* push stack frame */
500 mov fp, sp 500 mov fp, sp
501 mov x9, x0 /* x9 := enckey */ 501 mov x9, x0 /* x9 := enckey */
502 mov x10, x3 /* x10 := nbytes */ 502 mov x10, x3 /* x10 := nbytes */
503 ldr q0, [x4] /* q0 := chaining value */ 503 ldr q0, [x4] /* q0 := chaining value */
5041: ldr q1, [x1], #0x10 /* q1 := plaintext block */ 5041: ldr q1, [x1], #0x10 /* q1 := plaintext block */
505 eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ 505 eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
506 mov x0, x9 /* x0 := enckey */ 506 mov x0, x9 /* x0 := enckey */
507 mov x3, x5 /* x3 := nrounds */ 507 mov x3, x5 /* x3 := nrounds */
508 bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ 508 bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
509 subs x10, x10, #0x10 /* count down nbytes */ 509 subs x10, x10, #0x10 /* count down nbytes */
510 str q0, [x2], #0x10 /* store ciphertext block */ 510 str q0, [x2], #0x10 /* store ciphertext block */
511 b.ne 1b /* repeat if x10 is nonzero */ 511 b.ne 1b /* repeat if x10 is nonzero */
512 str q0, [x4] /* store chaining value */ 512 str q0, [x4] /* store chaining value */
513 ldp fp, lr, [sp], #16 /* pop stack frame */ 513 ldp fp, lr, [sp], #16 /* pop stack frame */
5142: ret 5142: ret
515END(aesarmv8_cbc_enc) 515END(aesarmv8_cbc_enc)
516 516
517/* 517/*
518 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, 518 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
519 * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, 519 * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
520 * uint32_t nrounds@x5) 520 * uint32_t nrounds@x5)
521 * 521 *
522 * Decrypt a contiguous sequence of blocks with AES-CBC. 522 * Decrypt a contiguous sequence of blocks with AES-CBC.
523 * 523 *
524 * nbytes must be a positive integral multiple of 16. This routine 524 * nbytes must be a positive integral multiple of 16. This routine
525 * is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once. 525 * is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
526 * 526 *
527 * Standard ABI calling convention. 527 * Standard ABI calling convention.
528 */ 528 */
529ENTRY(aesarmv8_cbc_dec1) 529ENTRY(aesarmv8_cbc_dec1)
530 stp fp, lr, [sp, #-16]! /* push stack frame */ 530 stp fp, lr, [sp, #-16]! /* push stack frame */
531 mov fp, sp 531 mov fp, sp
532 ldr q24, [x4] /* q24 := iv */ 532 ldr q24, [x4] /* q24 := iv */
533 mov x9, x0 /* x9 := deckey */ 533 mov x9, x0 /* x9 := deckey */
534 mov x10, x3 /* x10 := nbytes */ 534 mov x10, x3 /* x10 := nbytes */
535 add x1, x1, x3 /* x1 := pointer past end of in */ 535 add x1, x1, x3 /* x1 := pointer past end of in */
536 add x2, x2, x3 /* x2 := pointer past end of out */ 536 add x2, x2, x3 /* x2 := pointer past end of out */
537 ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */ 537 ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */
538 str q0, [x4] /* update iv */ 538 str q0, [x4] /* update iv */
5391: mov x0, x9 /* x0 := deckey */ 5391: mov x0, x9 /* x0 := deckey */
540 mov x3, x5 /* x3 := nrounds */ 540 mov x3, x5 /* x3 := nrounds */
541 bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */ 541 bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */
542 subs x10, x10, #0x10 /* count down nbytes */ 542 subs x10, x10, #0x10 /* count down nbytes */
543 b.eq 2f /* stop if this is the first block */ 543 b.eq 2f /* stop if this is the first block */
544 ldr q31, [x1, #-0x10]! /* q31 := chaining value */ 544 ldr q31, [x1, #-0x10]! /* q31 := chaining value */
545 eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */ 545 eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
546 str q0, [x2, #-0x10]! /* store plaintext block */ 546 str q0, [x2, #-0x10]! /* store plaintext block */
547 mov v0.16b, v31.16b /* move cv = ciphertext block */ 547 mov v0.16b, v31.16b /* move cv = ciphertext block */
548 b 1b 548 b 1b
5492: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */ 5492: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */
550 str q0, [x2, #-0x10]! /* store first plaintext block */ 550 str q0, [x2, #-0x10]! /* store first plaintext block */
551 ldp fp, lr, [sp], #16 /* pop stack frame */ 551 ldp fp, lr, [sp], #16 /* pop stack frame */
552 ret 552 ret
553END(aesarmv8_cbc_dec1) 553END(aesarmv8_cbc_dec1)
554 554
555/* 555/*
556 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, 556 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
557 * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, 557 * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
558 * uint32_t nrounds@x5) 558 * uint32_t nrounds@x5)
559 * 559 *
560 * Decrypt a contiguous sequence of 8-block units with AES-CBC. 560 * Decrypt a contiguous sequence of 8-block units with AES-CBC.
561 * 561 *
562 * nbytes must be a positive integral multiple of 128. 562 * nbytes must be a positive integral multiple of 128.
563 * 563 *
564 * Standard ABI calling convention. 564 * Standard ABI calling convention.
565 */ 565 */
566ENTRY(aesarmv8_cbc_dec8) 566ENTRY(aesarmv8_cbc_dec8)
567 stp fp, lr, [sp, #-16]! /* push stack frame */ 567 stp fp, lr, [sp, #-16]! /* push stack frame */
568 mov fp, sp 568 mov fp, sp
569 ldr q24, [x4] /* q24 := iv */ 569 ldr q24, [x4] /* q24 := iv */
570 mov x9, x0 /* x9 := deckey */ 570 mov x9, x0 /* x9 := deckey */
571 mov x10, x3 /* x10 := nbytes */ 571 mov x10, x3 /* x10 := nbytes */
572 add x1, x1, x3 /* x1 := pointer past end of in */ 572 add x1, x1, x3 /* x1 := pointer past end of in */
573 add x2, x2, x3 /* x2 := pointer past end of out */ 573 add x2, x2, x3 /* x2 := pointer past end of out */
574 ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */ 574 ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */
575 str q7, [x4] /* update iv */ 575 str q7, [x4] /* update iv */
5761: ldp q4, q5, [x1, #-0x20]! 5761: ldp q4, q5, [x1, #-0x20]!
577 ldp q2, q3, [x1, #-0x20]! 577 ldp q2, q3, [x1, #-0x20]!
578 ldp q0, q1, [x1, #-0x20]! 578 ldp q0, q1, [x1, #-0x20]!
579 mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */ 579 mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */
580 mov v30.16b, v5.16b 580 mov v30.16b, v5.16b
581 mov v29.16b, v4.16b 581 mov v29.16b, v4.16b
582 mov v28.16b, v3.16b 582 mov v28.16b, v3.16b
583 mov v27.16b, v2.16b 583 mov v27.16b, v2.16b
584 mov v26.16b, v1.16b 584 mov v26.16b, v1.16b
585 mov v25.16b, v0.16b 585 mov v25.16b, v0.16b
586 mov x0, x9 /* x0 := deckey */ 586 mov x0, x9 /* x0 := deckey */
587 mov x3, x5 /* x3 := nrounds */ 587 mov x3, x5 /* x3 := nrounds */
588 bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i]; 588 bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i];
589 * trash x0/x3/q16 */ 589 * trash x0/x3/q16 */
590 eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */ 590 eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */
591 eor v6.16b, v6.16b, v30.16b 591 eor v6.16b, v6.16b, v30.16b
592 eor v5.16b, v5.16b, v29.16b 592 eor v5.16b, v5.16b, v29.16b
593 eor v4.16b, v4.16b, v28.16b 593 eor v4.16b, v4.16b, v28.16b
594 eor v3.16b, v3.16b, v27.16b 594 eor v3.16b, v3.16b, v27.16b
595 eor v2.16b, v2.16b, v26.16b 595 eor v2.16b, v2.16b, v26.16b
596 eor v1.16b, v1.16b, v25.16b 596 eor v1.16b, v1.16b, v25.16b
597 subs x10, x10, #0x80 /* count down nbytes */ 597 subs x10, x10, #0x80 /* count down nbytes */
598 stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */ 598 stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */
599 stp q4, q5, [x2, #-0x20]! 599 stp q4, q5, [x2, #-0x20]!
600 stp q2, q3, [x2, #-0x20]! 600 stp q2, q3, [x2, #-0x20]!
601 b.eq 2f /* stop if this is the first block */ 601 b.eq 2f /* stop if this is the first block */
602 ldp q6, q7, [x1, #-0x20]! 602 ldp q6, q7, [x1, #-0x20]!
603 eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */ 603 eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
604 stp q0, q1, [x2, #-0x20]! 604 stp q0, q1, [x2, #-0x20]!
605 b 1b 605 b 1b
6062: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */ 6062: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */
607 stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */ 607 stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */
608 ldp fp, lr, [sp], #16 /* pop stack frame */ 608 ldp fp, lr, [sp], #16 /* pop stack frame */
609 ret 609 ret
610END(aesarmv8_cbc_dec8) 610END(aesarmv8_cbc_dec8)
611 611
612/* 612/*
613 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1, 613 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
614 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, 614 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
615 * uint32_t nrounds@x5) 615 * uint32_t nrounds@x5)
616 * 616 *
617 * Encrypt a contiguous sequence of blocks with AES-XTS. 617 * Encrypt a contiguous sequence of blocks with AES-XTS.
618 * 618 *
619 * nbytes must be a positive integral multiple of 16. This routine 619 * nbytes must be a positive integral multiple of 16. This routine
620 * is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once. 620 * is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
621 * 621 *
622 * Standard ABI calling convention. 622 * Standard ABI calling convention.
623 */ 623 */
624ENTRY(aesarmv8_xts_enc1) 624ENTRY(aesarmv8_xts_enc1)
625 stp fp, lr, [sp, #-16]! /* push stack frame */ 625 stp fp, lr, [sp, #-16]! /* push stack frame */
626 mov fp, sp 626 mov fp, sp
627 mov x9, x0 /* x9 := enckey */ 627 mov x9, x0 /* x9 := enckey */
628 mov x10, x3 /* x10 := nbytes */ 628 mov x10, x3 /* x10 := nbytes */
629 ldr q31, [x4] /* q31 := tweak */ 629 ldr q31, [x4] /* q31 := tweak */
6301: ldr q0, [x1], #0x10 /* q0 := ptxt */ 6301: ldr q0, [x1], #0x10 /* q0 := ptxt */
631 mov x0, x9 /* x0 := enckey */ 631 mov x0, x9 /* x0 := enckey */
632 mov x3, x5 /* x3 := nrounds */ 632 mov x3, x5 /* x3 := nrounds */
633 eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */ 633 eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */
634 bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */ 634 bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */
635 eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */ 635 eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
636 str q0, [x2], #0x10 /* store ciphertext block */ 636 str q0, [x2], #0x10 /* store ciphertext block */
637 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 637 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
638 subs x10, x10, #0x10 /* count down nbytes */ 638 subs x10, x10, #0x10 /* count down nbytes */
639 b.ne 1b /* repeat if more blocks */ 639 b.ne 1b /* repeat if more blocks */
640 str q31, [x4] /* update tweak */ 640 str q31, [x4] /* update tweak */
641 ldp fp, lr, [sp], #16 /* pop stack frame */ 641 ldp fp, lr, [sp], #16 /* pop stack frame */
642 ret 642 ret
643END(aesarmv8_xts_enc1) 643END(aesarmv8_xts_enc1)
644 644
645/* 645/*
646 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1, 646 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
647 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, 647 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
648 * uint32_t nrounds@x5) 648 * uint32_t nrounds@x5)
649 * 649 *
650 * Encrypt a contiguous sequence of blocks with AES-XTS. 650 * Encrypt a contiguous sequence of blocks with AES-XTS.
651 * 651 *
652 * nbytes must be a positive integral multiple of 128. 652 * nbytes must be a positive integral multiple of 128.
653 * 653 *
654 * Standard ABI calling convention. 654 * Standard ABI calling convention.
655 */ 655 */
656ENTRY(aesarmv8_xts_enc8) 656ENTRY(aesarmv8_xts_enc8)
657 stp fp, lr, [sp, #-16]! /* push stack frame */ 657 stp fp, lr, [sp, #-16]! /* push stack frame */
658 mov fp, sp 658 mov fp, sp
659 mov x9, x0 /* x9 := enckey */ 659 mov x9, x0 /* x9 := enckey */
660 mov x10, x3 /* x10 := nbytes */ 660 mov x10, x3 /* x10 := nbytes */
661 ldr q31, [x4] /* q31 := tweak */ 661 ldr q31, [x4] /* q31 := tweak */
6621: mov v24.16b, v31.16b /* q24 := tweak[0] */ 6621: mov v24.16b, v31.16b /* q24 := tweak[0] */
663 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 663 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
664 mov v25.16b, v31.16b /* q25 := tweak[1] */ 664 mov v25.16b, v31.16b /* q25 := tweak[1] */
665 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 665 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
666 mov v26.16b, v31.16b /* q26 := tweak[2] */ 666 mov v26.16b, v31.16b /* q26 := tweak[2] */
667 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 667 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
668 mov v27.16b, v31.16b /* q27 := tweak[3] */ 668 mov v27.16b, v31.16b /* q27 := tweak[3] */
669 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 669 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
670 mov v28.16b, v31.16b /* q28 := tweak[4] */ 670 mov v28.16b, v31.16b /* q28 := tweak[4] */
671 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 671 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
672 mov v29.16b, v31.16b /* q29 := tweak[5] */ 672 mov v29.16b, v31.16b /* q29 := tweak[5] */
673 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 673 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
674 mov v30.16b, v31.16b /* q30 := tweak[6] */ 674 mov v30.16b, v31.16b /* q30 := tweak[6] */
675 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 675 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
676 /* q31 := tweak[7] */ 676 /* q31 := tweak[7] */
677 ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */ 677 ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */
678 ldp q2, q3, [x1], #0x20 678 ldp q2, q3, [x1], #0x20
679 ldp q4, q5, [x1], #0x20 679 ldp q4, q5, [x1], #0x20
680 ldp q6, q7, [x1], #0x20 680 ldp q6, q7, [x1], #0x20
681 eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */ 681 eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */
682 eor v1.16b, v1.16b, v25.16b 682 eor v1.16b, v1.16b, v25.16b
683 eor v2.16b, v2.16b, v26.16b 683 eor v2.16b, v2.16b, v26.16b
684 eor v3.16b, v3.16b, v27.16b 684 eor v3.16b, v3.16b, v27.16b
685 eor v4.16b, v4.16b, v28.16b 685 eor v4.16b, v4.16b, v28.16b
686 eor v5.16b, v5.16b, v29.16b 686 eor v5.16b, v5.16b, v29.16b
687 eor v6.16b, v6.16b, v30.16b 687 eor v6.16b, v6.16b, v30.16b
688 eor v7.16b, v7.16b, v31.16b 688 eor v7.16b, v7.16b, v31.16b
689 mov x0, x9 /* x0 := enckey */ 689 mov x0, x9 /* x0 := enckey */
690 mov x3, x5 /* x3 := nrounds */ 690 mov x3, x5 /* x3 := nrounds */
691 bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */ 691 bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */
692 eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ 692 eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
693 eor v1.16b, v1.16b, v25.16b 693 eor v1.16b, v1.16b, v25.16b
694 eor v2.16b, v2.16b, v26.16b 694 eor v2.16b, v2.16b, v26.16b
695 eor v3.16b, v3.16b, v27.16b 695 eor v3.16b, v3.16b, v27.16b
696 eor v4.16b, v4.16b, v28.16b 696 eor v4.16b, v4.16b, v28.16b
697 eor v5.16b, v5.16b, v29.16b 697 eor v5.16b, v5.16b, v29.16b
698 eor v6.16b, v6.16b, v30.16b 698 eor v6.16b, v6.16b, v30.16b
699 eor v7.16b, v7.16b, v31.16b 699 eor v7.16b, v7.16b, v31.16b
700 stp q0, q1, [x2], #0x20 /* store ciphertext blocks */ 700 stp q0, q1, [x2], #0x20 /* store ciphertext blocks */
701 stp q2, q3, [x2], #0x20 701 stp q2, q3, [x2], #0x20
702 stp q4, q5, [x2], #0x20 702 stp q4, q5, [x2], #0x20
703 stp q6, q7, [x2], #0x20 703 stp q6, q7, [x2], #0x20
704 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 704 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
705 subs x10, x10, #0x80 /* count down nbytes */ 705 subs x10, x10, #0x80 /* count down nbytes */
706 b.ne 1b /* repeat if more block groups */ 706 b.ne 1b /* repeat if more block groups */
707 str q31, [x4] /* update tweak */ 707 str q31, [x4] /* update tweak */
708 ldp fp, lr, [sp], #16 /* pop stack frame */ 708 ldp fp, lr, [sp], #16 /* pop stack frame */
709 ret 709 ret
710END(aesarmv8_xts_enc8) 710END(aesarmv8_xts_enc8)
711 711
712/* 712/*
713 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, 713 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
714 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, 714 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
715 * uint32_t nrounds@x5) 715 * uint32_t nrounds@x5)
716 * 716 *
717 * Decrypt a contiguous sequence of blocks with AES-XTS. 717 * Decrypt a contiguous sequence of blocks with AES-XTS.
718 * 718 *
719 * nbytes must be a positive integral multiple of 16. This routine 719 * nbytes must be a positive integral multiple of 16. This routine
720 * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once. 720 * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
721 * 721 *
722 * Standard ABI calling convention. 722 * Standard ABI calling convention.
723 */ 723 */
724ENTRY(aesarmv8_xts_dec1) 724ENTRY(aesarmv8_xts_dec1)
725 stp fp, lr, [sp, #-16]! /* push stack frame */ 725 stp fp, lr, [sp, #-16]! /* push stack frame */
726 mov fp, sp 726 mov fp, sp
727 mov x9, x0 /* x9 := deckey */ 727 mov x9, x0 /* x9 := deckey */
728 mov x10, x3 /* x10 := nbytes */ 728 mov x10, x3 /* x10 := nbytes */
729 ldr q31, [x4] /* q31 := tweak */ 729 ldr q31, [x4] /* q31 := tweak */
7301: ldr q0, [x1], #0x10 /* q0 := ctxt */ 7301: ldr q0, [x1], #0x10 /* q0 := ctxt */
731 mov x0, x9 /* x0 := deckey */ 731 mov x0, x9 /* x0 := deckey */
732 mov x3, x5 /* x3 := nrounds */ 732 mov x3, x5 /* x3 := nrounds */
733 eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */ 733 eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */
734 bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */ 734 bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */
735 eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */ 735 eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */
736 str q0, [x2], #0x10 /* store plaintext block */ 736 str q0, [x2], #0x10 /* store plaintext block */
737 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 737 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
738 subs x10, x10, #0x10 /* count down nbytes */ 738 subs x10, x10, #0x10 /* count down nbytes */
739 b.ne 1b /* repeat if more blocks */ 739 b.ne 1b /* repeat if more blocks */
740 str q31, [x4] /* update tweak */ 740 str q31, [x4] /* update tweak */
741 ldp fp, lr, [sp], #16 /* pop stack frame */ 741 ldp fp, lr, [sp], #16 /* pop stack frame */
742 ret 742 ret
743END(aesarmv8_xts_dec1) 743END(aesarmv8_xts_dec1)
744 744
745/* 745/*
746 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, 746 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
747 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, 747 * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
748 * uint32_t nrounds@x5) 748 * uint32_t nrounds@x5)
749 * 749 *
750 * Decrypt a contiguous sequence of blocks with AES-XTS. 750 * Decrypt a contiguous sequence of blocks with AES-XTS.
751 * 751 *
752 * nbytes must be a positive integral multiple of 128. 752 * nbytes must be a positive integral multiple of 128.
753 * 753 *
754 * Standard ABI calling convention. 754 * Standard ABI calling convention.
755 */ 755 */
756ENTRY(aesarmv8_xts_dec8) 756ENTRY(aesarmv8_xts_dec8)
757 stp fp, lr, [sp, #-16]! /* push stack frame */ 757 stp fp, lr, [sp, #-16]! /* push stack frame */
758 mov fp, sp 758 mov fp, sp
759 mov x9, x0 /* x9 := deckey */ 759 mov x9, x0 /* x9 := deckey */
760 mov x10, x3 /* x10 := nbytes */ 760 mov x10, x3 /* x10 := nbytes */
761 ldr q31, [x4] /* q31 := tweak */ 761 ldr q31, [x4] /* q31 := tweak */
7621: mov v24.16b, v31.16b /* q24 := tweak[0] */ 7621: mov v24.16b, v31.16b /* q24 := tweak[0] */
763 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 763 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
764 mov v25.16b, v31.16b /* q25 := tweak[1] */ 764 mov v25.16b, v31.16b /* q25 := tweak[1] */
765 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 765 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
766 mov v26.16b, v31.16b /* q26 := tweak[2] */ 766 mov v26.16b, v31.16b /* q26 := tweak[2] */
767 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 767 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
768 mov v27.16b, v31.16b /* q27 := tweak[3] */ 768 mov v27.16b, v31.16b /* q27 := tweak[3] */
769 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 769 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
770 mov v28.16b, v31.16b /* q28 := tweak[4] */ 770 mov v28.16b, v31.16b /* q28 := tweak[4] */
771 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 771 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
772 mov v29.16b, v31.16b /* q29 := tweak[5] */ 772 mov v29.16b, v31.16b /* q29 := tweak[5] */
773 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 773 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
774 mov v30.16b, v31.16b /* q30 := tweak[6] */ 774 mov v30.16b, v31.16b /* q30 := tweak[6] */
775 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 775 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
776 /* q31 := tweak[7] */ 776 /* q31 := tweak[7] */
777 ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */ 777 ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */
778 ldp q2, q3, [x1], #0x20 778 ldp q2, q3, [x1], #0x20
779 ldp q4, q5, [x1], #0x20 779 ldp q4, q5, [x1], #0x20
780 ldp q6, q7, [x1], #0x20 780 ldp q6, q7, [x1], #0x20
781 eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */ 781 eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */
782 eor v1.16b, v1.16b, v25.16b 782 eor v1.16b, v1.16b, v25.16b
783 eor v2.16b, v2.16b, v26.16b 783 eor v2.16b, v2.16b, v26.16b
784 eor v3.16b, v3.16b, v27.16b 784 eor v3.16b, v3.16b, v27.16b
785 eor v4.16b, v4.16b, v28.16b 785 eor v4.16b, v4.16b, v28.16b
786 eor v5.16b, v5.16b, v29.16b 786 eor v5.16b, v5.16b, v29.16b
787 eor v6.16b, v6.16b, v30.16b 787 eor v6.16b, v6.16b, v30.16b
788 eor v7.16b, v7.16b, v31.16b 788 eor v7.16b, v7.16b, v31.16b
789 mov x0, x9 /* x0 := deckey */ 789 mov x0, x9 /* x0 := deckey */
790 mov x3, x5 /* x3 := nrounds */ 790 mov x3, x5 /* x3 := nrounds */
791 bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */ 791 bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */
792 eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ 792 eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
793 eor v1.16b, v1.16b, v25.16b 793 eor v1.16b, v1.16b, v25.16b
794 eor v2.16b, v2.16b, v26.16b 794 eor v2.16b, v2.16b, v26.16b
795 eor v3.16b, v3.16b, v27.16b 795 eor v3.16b, v3.16b, v27.16b
796 eor v4.16b, v4.16b, v28.16b 796 eor v4.16b, v4.16b, v28.16b
797 eor v5.16b, v5.16b, v29.16b 797 eor v5.16b, v5.16b, v29.16b
798 eor v6.16b, v6.16b, v30.16b 798 eor v6.16b, v6.16b, v30.16b
799 eor v7.16b, v7.16b, v31.16b 799 eor v7.16b, v7.16b, v31.16b
800 stp q0, q1, [x2], #0x20 /* store plaintext blocks */ 800 stp q0, q1, [x2], #0x20 /* store plaintext blocks */
801 stp q2, q3, [x2], #0x20 801 stp q2, q3, [x2], #0x20
802 stp q4, q5, [x2], #0x20 802 stp q4, q5, [x2], #0x20
803 stp q6, q7, [x2], #0x20 803 stp q6, q7, [x2], #0x20
804 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 804 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
805 subs x10, x10, #0x80 /* count down nbytes */ 805 subs x10, x10, #0x80 /* count down nbytes */
806 b.ne 1b /* repeat if more block groups */ 806 b.ne 1b /* repeat if more block groups */
807 str q31, [x4] /* update tweak */ 807 str q31, [x4] /* update tweak */
808 ldp fp, lr, [sp], #16 /* pop stack frame */ 808 ldp fp, lr, [sp], #16 /* pop stack frame */
809 ret 809 ret
810END(aesarmv8_xts_dec8) 810END(aesarmv8_xts_dec8)
811 811
812/* 812/*
813 * aesarmv8_xts_mulx(tweak@q31) 813 * aesarmv8_xts_mulx(tweak@q31)
814 * 814 *
815 * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place. 815 * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
816 * Uses x0 and q0/q1 as temporaries. 816 * Uses x0 and q0/q1 as temporaries.
817 */ 817 */
818 .text 818 .text
819 _ALIGN_TEXT 819 _ALIGN_TEXT
820 .type aesarmv8_xts_mulx,@function 820 .type aesarmv8_xts_mulx,@function
821aesarmv8_xts_mulx: 821aesarmv8_xts_mulx:
822 /* 822 /*
823 * Simultaneously determine 823 * Simultaneously determine
824 * (a) whether the high bit of the low half must be 824 * (a) whether the high bit of the low half must be
825 * shifted into the low bit of the high half, and 825 * shifted into the low bit of the high half, and
826 * (b) whether the high bit of the high half must be 826 * (b) whether the high bit of the high half must be
827 * carried into x^128 = x^7 + x^2 + x + 1. 827 * carried into x^128 = x^7 + x^2 + x + 1.
828 */ 828 */
829 adrl x0, xtscarry 829 adrl x0, xtscarry
830 cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */ 830 cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
831 ldr q0, [x0] /* q0 := xtscarry */ 831 ldr q0, [x0] /* q0 := xtscarry */
832 ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */ 832 ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
833 shl v31.2d, v31.2d, #1 /* shift */ 833 shl v31.2d, v31.2d, #1 /* shift */
834 and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */ 834 and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */
835 eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */ 835 eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
836 ret 836 ret
837END(aesarmv8_xts_mulx) 837END(aesarmv8_xts_mulx)
838 838
839 .section .rodata 839 .section .rodata
840 .p2align 4 840 .p2align 4
841 .type xtscarry,@object 841 .type xtscarry,@object
842xtscarry: 842xtscarry:
843 .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0 843 .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
844END(xtscarry) 844END(xtscarry)
845 845
846/* 846/*
847 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1) 847 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
848 * 848 *
849 * Update an AES-XTS tweak. 849 * Update an AES-XTS tweak.
850 * 850 *
851 * Standard ABI calling convention. 851 * Standard ABI calling convention.
852 */ 852 */
853ENTRY(aesarmv8_xts_update) 853ENTRY(aesarmv8_xts_update)
854 stp fp, lr, [sp, #-16]! /* push stack frame */ 854 stp fp, lr, [sp, #-16]! /* push stack frame */
855 mov fp, sp 855 mov fp, sp
856 ldr q31, [x0] /* load tweak */ 856 ldr q31, [x0] /* load tweak */
857 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ 857 bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
858 str q31, [x1] /* store tweak */ 858 str q31, [x1] /* store tweak */
859 ldp fp, lr, [sp], #16 /* pop stack frame */ 859 ldp fp, lr, [sp], #16 /* pop stack frame */
860 ret 860 ret
861END(aesarmv8_xts_update) 861END(aesarmv8_xts_update)
862 862
863	/*
864	 * aesarmv8_enc1(const struct aesenc *enckey@x0,
865	 * uint128_t block@q0, uint32_t nrounds@x3)
866	 *
867	 * Encrypt a single AES block in q0.
868	 *
869	 * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
870	 */
871		.text
872		_ALIGN_TEXT
873		.type	aesarmv8_enc1,@function
874	aesarmv8_enc1:
875		ldr	q16, [x0], #0x10	/* load round key */
876	1:	subs	x3, x3, #1
877		/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
878		aese	v0.16b, v16.16b
879		ldr	q16, [x0], #0x10	/* load next round key */
880		b.eq	2f
881		/* q0 := MixColumns(q0) */
882		aesmc	v0.16b, v0.16b
883		b	1b
884	2:	eor	v0.16b, v0.16b, v16.16b
885		ret
886	END(aesarmv8_enc1)
887
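The same round structure can be expressed with the ACLE crypto intrinsics, which is a useful cross-check when reading the loop above: aese folds AddRoundKey, SubBytes, and ShiftRows into one instruction, and aesmc supplies MixColumns on every round but the last, with a final XOR against the (nrounds+1)th key. The C below is a hedged sketch, not the NetBSD implementation; it assumes the round keys are laid out as nrounds+1 consecutive 16-byte values, as the post-incremented ldr above implies, and must be compiled for the AES extension (e.g. -march=armv8-a+crypto).

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical sketch of the aesarmv8_enc1 round structure in C. */
static uint8x16_t
aes_enc1_sketch(const uint8_t *rk, uint8x16_t block, unsigned nrounds)
{
	unsigned i;

	/* rounds 1..nrounds-1: AddRoundKey+SubBytes+ShiftRows, then MixColumns */
	for (i = 0; i < nrounds - 1; i++, rk += 16)
		block = vaesmcq_u8(vaeseq_u8(block, vld1q_u8(rk)));

	/* last full round: no MixColumns */
	block = vaeseq_u8(block, vld1q_u8(rk));
	rk += 16;

	/* final AddRoundKey with the (nrounds+1)th round key */
	return veorq_u8(block, vld1q_u8(rk));
}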
888	/*
889	 * aesarmv8_enc8(const struct aesenc *enckey@x0,
890	 * uint128_t block0@q0, ..., uint128_t block7@q7,
891	 * uint32_t nrounds@x3)
892	 *
893	 * Encrypt eight AES blocks in q0 through q7 in parallel.
894	 *
895	 * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
896	 */
897		.text
898		_ALIGN_TEXT
899		.type	aesarmv8_enc8,@function
900	aesarmv8_enc8:
901		ldr	q16, [x0], #0x10	/* load round key */
902	1:	subs	x3, x3, #1
903		/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
904		aese	v0.16b, v16.16b
905		aese	v1.16b, v16.16b
906		aese	v2.16b, v16.16b
907		aese	v3.16b, v16.16b
908		aese	v4.16b, v16.16b
909		aese	v5.16b, v16.16b
910		aese	v6.16b, v16.16b
911		aese	v7.16b, v16.16b
912		ldr	q16, [x0], #0x10	/* load next round key */
913		b.eq	2f
914		/* q[i] := MixColumns(q[i]) */
915		aesmc	v0.16b, v0.16b
916		aesmc	v1.16b, v1.16b
917		aesmc	v2.16b, v2.16b
918		aesmc	v3.16b, v3.16b
919		aesmc	v4.16b, v4.16b
920		aesmc	v5.16b, v5.16b
921		aesmc	v6.16b, v6.16b
922		aesmc	v7.16b, v7.16b
923		b	1b
924	2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
925		eor	v1.16b, v1.16b, v16.16b
926		eor	v2.16b, v2.16b, v16.16b
927		eor	v3.16b, v3.16b, v16.16b
928		eor	v4.16b, v4.16b, v16.16b
929		eor	v5.16b, v5.16b, v16.16b
930		eor	v6.16b, v6.16b, v16.16b
931		eor	v7.16b, v7.16b, v16.16b
932		ret
933	END(aesarmv8_enc8)
934
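aesarmv8_enc8 is the single-block loop unrolled across eight independent blocks: each round key is loaded once and applied to q0 through q7 before the next key is fetched, so the aese/aesmc dependency chains of the different blocks can overlap in the pipeline. A hedged C sketch of that shape, under the same assumptions and hypothetical names as the single-block sketch above:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical sketch: eight blocks share each round key per round. */
static void
aes_enc8_sketch(const uint8_t *rk, uint8x16_t b[8], unsigned nrounds)
{
	unsigned i, j;
	uint8x16_t k;

	for (i = 0; i < nrounds - 1; i++, rk += 16) {
		k = vld1q_u8(rk);
		for (j = 0; j < 8; j++)	/* independent chain per block */
			b[j] = vaesmcq_u8(vaeseq_u8(b[j], k));
	}
	k = vld1q_u8(rk);		/* last full round, no MixColumns */
	for (j = 0; j < 8; j++)
		b[j] = vaeseq_u8(b[j], k);
	k = vld1q_u8(rk + 16);		/* final AddRoundKey */
	for (j = 0; j < 8; j++)
		b[j] = veorq_u8(b[j], k);
}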
935	/*
936	 * aesarmv8_dec1(const struct aesdec *deckey@x0,
937	 * uint128_t block@q0, uint32_t nrounds@x3)
938	 *
939	 * Decrypt a single AES block in q0.
940	 *
941	 * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
942	 */
943		.text
944		_ALIGN_TEXT
945		.type	aesarmv8_dec1,@function
946	aesarmv8_dec1:
947		ldr	q16, [x0], #0x10	/* load round key */
948	1:	subs	x3, x3, #1
949		/* q0 := InvSubBytes(InvShiftRows(AddRoundKey_q16(q0))) */
950		aesd	v0.16b, v16.16b
951		ldr	q16, [x0], #0x10	/* load next round key */
952		b.eq	2f
953		/* q0 := InvMixColumns(q0) */
954		aesimc	v0.16b, v0.16b
955		b	1b
956	2:	eor	v0.16b, v0.16b, v16.16b
957		ret
958	END(aesarmv8_dec1)
959
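Decryption mirrors the encryption loop with the inverse instructions: aesd combines AddRoundKey, InvShiftRows, and InvSubBytes, and aesimc supplies InvMixColumns between rounds. Below is a hedged C sketch under the same assumptions as before, taking the round keys in the decryption-order layout that struct aesdec is expected to provide to the assembly; the helper name is hypothetical.

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical sketch of the aesarmv8_dec1 round structure in C. */
static uint8x16_t
aes_dec1_sketch(const uint8_t *rk, uint8x16_t block, unsigned nrounds)
{
	unsigned i;

	for (i = 0; i < nrounds - 1; i++, rk += 16)
		block = vaesimcq_u8(vaesdq_u8(block, vld1q_u8(rk)));
	block = vaesdq_u8(block, vld1q_u8(rk));	/* last round, no InvMixColumns */
	return veorq_u8(block, vld1q_u8(rk + 16));	/* final AddRoundKey */
}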
960	/*
961	 * aesarmv8_dec8(const struct aesdec *deckey@x0,
962	 * uint128_t block0@q0, ..., uint128_t block7@q7,
963	 * uint32_t nrounds@x3)
964	 *
965	 * Decrypt eight AES blocks in q0 through q7 in parallel.
966	 *
967	 * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
968	 */
969		.text
970		_ALIGN_TEXT
971		.type	aesarmv8_dec8,@function
972	aesarmv8_dec8:
973		ldr	q16, [x0], #0x10	/* load round key */
974	1:	subs	x3, x3, #1
975		/* q[i] := InvSubBytes(InvShiftRows(AddRoundKey_q16(q[i]))) */
976		aesd	v0.16b, v16.16b
977		aesd	v1.16b, v16.16b
978		aesd	v2.16b, v16.16b
979		aesd	v3.16b, v16.16b
980		aesd	v4.16b, v16.16b
981		aesd	v5.16b, v16.16b
982		aesd	v6.16b, v16.16b
983		aesd	v7.16b, v16.16b
984		ldr	q16, [x0], #0x10	/* load next round key */
985		b.eq	2f
986		/* q[i] := InvMixColumns(q[i]) */
987		aesimc	v0.16b, v0.16b
988		aesimc	v1.16b, v1.16b
989		aesimc	v2.16b, v2.16b
990		aesimc	v3.16b, v3.16b
991		aesimc	v4.16b, v4.16b
992		aesimc	v5.16b, v5.16b
993		aesimc	v6.16b, v6.16b
994		aesimc	v7.16b, v7.16b
995		b	1b
996	2:	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
997		eor	v1.16b, v1.16b, v16.16b
998		eor	v2.16b, v2.16b, v16.16b
999		eor	v3.16b, v3.16b, v16.16b
1000		eor	v4.16b, v4.16b, v16.16b
1001		eor	v5.16b, v5.16b, v16.16b
1002		eor	v6.16b, v6.16b, v16.16b
1003		eor	v7.16b, v7.16b, v16.16b
1004		ret
1005	END(aesarmv8_dec8)