| @@ -1,1005 +1,1005 @@ | | | @@ -1,1005 +1,1005 @@ |
1 | /* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */ | | 1 | /* $NetBSD: aes_armv8_64.S,v 1.5 2020/07/19 07:32:43 ryo Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
15 | * | | 15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #include <aarch64/asm.h> | | 29 | #include <aarch64/asm.h> |
30 | | | 30 | |
31 | .arch_extension aes | | 31 | .arch_extension aes |
32 | | | 32 | |
33 | /* | | 33 | /* |
34 | * uint32_t rcon[10] | | 34 | * uint32_t rcon[10] |
35 | * | | 35 | * |
36 | * Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2). | | 36 | * Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2). |
37 | * Such elements of GF(2^8) need only eight bits to be represented, | | 37 | * Such elements of GF(2^8) need only eight bits to be represented, |
38 | * but we store them in 4-byte units so we can copy one into all | | 38 | * but we store them in 4-byte units so we can copy one into all |
39 | * four 4-byte lanes of a vector register with a single LD1R. The | | 39 | * four 4-byte lanes of a vector register with a single LD1R. The |
40 | * access pattern is fixed, so indices into this table are never | | 40 | * access pattern is fixed, so indices into this table are never |
41 | * secret. | | 41 | * secret. |
42 | */ | | 42 | */ |
43 | .section .rodata | | 43 | .section .rodata |
44 | .p2align 2 | | 44 | .p2align 2 |
45 | .type rcon,@object | | 45 | .type rcon,@object |
46 | rcon: | | 46 | rcon: |
47 | .long 0x01 | | 47 | .long 0x01 |
48 | .long 0x02 | | 48 | .long 0x02 |
49 | .long 0x04 | | 49 | .long 0x04 |
50 | .long 0x08 | | 50 | .long 0x08 |
51 | .long 0x10 | | 51 | .long 0x10 |
52 | .long 0x20 | | 52 | .long 0x20 |
53 | .long 0x40 | | 53 | .long 0x40 |
54 | .long 0x80 | | 54 | .long 0x80 |
55 | .long 0x1b | | 55 | .long 0x1b |
56 | .long 0x36 | | 56 | .long 0x36 |
57 | END(rcon) | | 57 | END(rcon) |
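
For reference, the ten entries above are the successive powers of x reduced modulo the AES polynomial. A minimal C sketch that regenerates them (illustrative only; xtime and gen_rcon are hypothetical names, not symbols from this file):

    #include <stdint.h>

    /* Multiply by x in GF(2^8); fold in 0x1b when bit 7 carries out. */
    static uint8_t
    xtime(uint8_t a)
    {
        return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
    }

    static void
    gen_rcon(uint32_t rcon[10])
    {
        uint8_t x = 0x01;

        for (int i = 0; i < 10; i++) {
            rcon[i] = x;    /* 0x01, 0x02, ..., 0x80, 0x1b, 0x36 */
            x = xtime(x);
        }
    }
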
58 | | | 58 | |
59 | /* | | 59 | /* |
60 | * uint128_t unshiftrows_rotword_1 | | 60 | * uint128_t unshiftrows_rotword_1 |
61 | * | | 61 | * |
62 | * Table for TBL instruction to undo ShiftRows, and then do | | 62 | * Table for TBL instruction to undo ShiftRows, and then do |
63 | * RotWord on word 1, and then copy it into all the other words. | | 63 | * RotWord on word 1, and then copy it into all the other words. |
64 | */ | | 64 | */ |
65 | .section .rodata | | 65 | .section .rodata |
66 | .p2align 4 | | 66 | .p2align 4 |
67 | .type unshiftrows_rotword_1,@object | | 67 | .type unshiftrows_rotword_1,@object |
68 | unshiftrows_rotword_1: | | 68 | unshiftrows_rotword_1: |
69 | .byte 0x01,0x0e,0x0b,0x04 | | 69 | .byte 0x01,0x0e,0x0b,0x04 |
70 | .byte 0x01,0x0e,0x0b,0x04 | | 70 | .byte 0x01,0x0e,0x0b,0x04 |
71 | .byte 0x01,0x0e,0x0b,0x04 | | 71 | .byte 0x01,0x0e,0x0b,0x04 |
72 | .byte 0x01,0x0e,0x0b,0x04 | | 72 | .byte 0x01,0x0e,0x0b,0x04 |
73 | END(unshiftrows_rotword_1) | | 73 | END(unshiftrows_rotword_1) |
74 | | | 74 | |
75 | /* | | 75 | /* |
76 | * uint128_t unshiftrows_3 | | 76 | * uint128_t unshiftrows_3 |
77 | * | | 77 | * |
78 | * Table for TBL instruction to undo ShiftRows, and then copy word | | 78 | * Table for TBL instruction to undo ShiftRows, and then copy word |
79 | * 3 into all the other words. | | 79 | * 3 into all the other words. |
80 | */ | | 80 | */ |
81 | .section .rodata | | 81 | .section .rodata |
82 | .p2align 4 | | 82 | .p2align 4 |
83 | .type unshiftrows_3,@object | | 83 | .type unshiftrows_3,@object |
84 | unshiftrows_3: | | 84 | unshiftrows_3: |
85 | .byte 0x0c,0x09,0x06,0x03 | | 85 | .byte 0x0c,0x09,0x06,0x03 |
86 | .byte 0x0c,0x09,0x06,0x03 | | 86 | .byte 0x0c,0x09,0x06,0x03 |
87 | .byte 0x0c,0x09,0x06,0x03 | | 87 | .byte 0x0c,0x09,0x06,0x03 |
88 | .byte 0x0c,0x09,0x06,0x03 | | 88 | .byte 0x0c,0x09,0x06,0x03 |
89 | END(unshiftrows_3) | | 89 | END(unshiftrows_3) |
90 | | | 90 | |
91 | /* | | 91 | /* |
92 | * uint128_t unshiftrows_rotword_3 | | 92 | * uint128_t unshiftrows_rotword_3 |
93 | * | | 93 | * |
94 | * Table for TBL instruction to undo ShiftRows, and then do | | 94 | * Table for TBL instruction to undo ShiftRows, and then do |
95 | * RotWord on word 3, and then copy it into all the other words. | | 95 | * RotWord on word 3, and then copy it into all the other words. |
96 | */ | | 96 | */ |
97 | .section .rodata | | 97 | .section .rodata |
98 | .p2align 4 | | 98 | .p2align 4 |
99 | .type unshiftrows_rotword_3,@object | | 99 | .type unshiftrows_rotword_3,@object |
100 | unshiftrows_rotword_3: | | 100 | unshiftrows_rotword_3: |
101 | .byte 0x09,0x06,0x03,0x0c | | 101 | .byte 0x09,0x06,0x03,0x0c |
102 | .byte 0x09,0x06,0x03,0x0c | | 102 | .byte 0x09,0x06,0x03,0x0c |
103 | .byte 0x09,0x06,0x03,0x0c | | 103 | .byte 0x09,0x06,0x03,0x0c |
104 | .byte 0x09,0x06,0x03,0x0c | | 104 | .byte 0x09,0x06,0x03,0x0c |
105 | END(unshiftrows_rotword_3) | | 105 | END(unshiftrows_rotword_3) |
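
All three tables above follow one pattern: AESE leaves the state as ShiftRows(SubBytes(.)), so each table holds TBL byte indices that read one word of SubBytes(state) back out through the inverse ShiftRows permutation, optionally rotated, replicated into all four words. A C sketch of the derivation (assumed names, not from the source):

    #include <stdint.h>

    /* shiftrows[j] = index of the byte that ShiftRows moves to slot j */
    static const uint8_t shiftrows[16] = {
        0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
    };

    /*
     * Build a 16-byte TBL index table selecting `word' (0-3), with an
     * optional RotWord, replicated into all four 32-bit lanes.
     */
    static void
    gen_unshiftrows(uint8_t tbl[16], int word, int rotword)
    {
        uint8_t idx[4];

        for (int b = 0; b < 4; b++) {
            int orig = 4*word + (b + rotword) % 4;
            for (int j = 0; j < 16; j++)
                if (shiftrows[j] == orig)
                    idx[b] = (uint8_t)j;
        }
        for (int i = 0; i < 16; i++)
            tbl[i] = idx[i % 4];
    }

gen_unshiftrows(tbl, 1, 1) yields 0x01,0x0e,0x0b,0x04 (unshiftrows_rotword_1); (3, 0) yields unshiftrows_3; (3, 1) yields unshiftrows_rotword_3.
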
106 | | | 106 | |
107 | /* | | 107 | /* |
108 | * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1) | | 108 | * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1) |
109 | * | | 109 | * |
110 | * Expand a 16-byte AES-128 key into 10 round keys. | | 110 | * Expand a 16-byte AES-128 key into 10 round keys. |
111 | * | | 111 | * |
112 | * Standard ABI calling convention. | | 112 | * Standard ABI calling convention. |
113 | */ | | 113 | */ |
114 | ENTRY(aesarmv8_setenckey128) | | 114 | ENTRY(aesarmv8_setenckey128) |
115 | ldr q1, [x1] /* q1 := master key */ | | 115 | ldr q1, [x1] /* q1 := master key */ |
116 | | | 116 | |
117 | adrl x4, unshiftrows_rotword_3 | | 117 | adrl x4, unshiftrows_rotword_3 |
118 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ | | 118 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ |
119 | ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */ | | 119 | ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */ |
120 | | | 120 | |
121 | str q1, [x0], #0x10 /* store master key as first round key */ | | 121 | str q1, [x0], #0x10 /* store master key as first round key */ |
122 | mov x2, #10 /* round count */ | | 122 | mov x2, #10 /* round count */ |
123 | adrl x3, rcon /* round constant */ | | 123 | adrl x3, rcon /* round constant */ |
124 | | | 124 | |
125 | 1: /* | | 125 | 1: /* |
126 | * q0 = 0 | | 126 | * q0 = 0 |
127 | * v1.4s = (prk[0], prk[1], prk[2], prk[3]) | | 127 | * v1.4s = (prk[0], prk[1], prk[2], prk[3]) |
128 | * x0 = pointer to round key to compute | | 128 | * x0 = pointer to round key to compute |
129 | * x2 = round count | | 129 | * x2 = round count |
130 | * x3 = rcon pointer | | 130 | * x3 = rcon pointer |
131 | */ | | 131 | */ |
132 | | | 132 | |
133 | /* q3 := ShiftRows(SubBytes(q1)) */ | | 133 | /* q3 := ShiftRows(SubBytes(q1)) */ |
134 | mov v3.16b, v1.16b | | 134 | mov v3.16b, v1.16b |
135 | aese v3.16b, v0.16b | | 135 | aese v3.16b, v0.16b |
136 | | | 136 | |
137 | /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ | | 137 | /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ |
138 | ld1r {v4.4s}, [x3], #4 | | 138 | ld1r {v4.4s}, [x3], #4 |
139 | tbl v3.16b, {v3.16b}, v16.16b | | 139 | tbl v3.16b, {v3.16b}, v16.16b |
140 | eor v3.16b, v3.16b, v4.16b | | 140 | eor v3.16b, v3.16b, v4.16b |
141 | | | 141 | |
142 | /* | | 142 | /* |
143 | * v5.4s := (0,prk[0],prk[1],prk[2]) | | 143 | * v5.4s := (0,prk[0],prk[1],prk[2]) |
144 | * v6.4s := (0,0,prk[0],prk[1]) | | 144 | * v6.4s := (0,0,prk[0],prk[1]) |
145 | * v7.4s := (0,0,0,prk[0]) | | 145 | * v7.4s := (0,0,0,prk[0]) |
146 | */ | | 146 | */ |
147 | ext v5.16b, v0.16b, v1.16b, #12 | | 147 | ext v5.16b, v0.16b, v1.16b, #12 |
148 | ext v6.16b, v0.16b, v1.16b, #8 | | 148 | ext v6.16b, v0.16b, v1.16b, #8 |
149 | ext v7.16b, v0.16b, v1.16b, #4 | | 149 | ext v7.16b, v0.16b, v1.16b, #4 |
150 | | | 150 | |
151 | /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ | | 151 | /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ |
152 | eor v1.16b, v1.16b, v3.16b | | 152 | eor v1.16b, v1.16b, v3.16b |
153 | eor v1.16b, v1.16b, v5.16b | | 153 | eor v1.16b, v1.16b, v5.16b |
154 | eor v1.16b, v1.16b, v6.16b | | 154 | eor v1.16b, v1.16b, v6.16b |
155 | eor v1.16b, v1.16b, v7.16b | | 155 | eor v1.16b, v1.16b, v7.16b |
156 | | | 156 | |
157 | subs x2, x2, #1 /* count down rounds */ | | 157 | subs x2, x2, #1 /* count down rounds */ |
158 | str q1, [x0], #0x10 /* store round key */ | | 158 | str q1, [x0], #0x10 /* store round key */ |
159 | b.ne 1b | | 159 | b.ne 1b |
160 | | | 160 | |
161 | ret | | 161 | ret |
162 | END(aesarmv8_setenckey128) | | 162 | END(aesarmv8_setenckey128) |
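
In C terms, each pass of the loop above performs one FIPS-197 key-expansion step; the EXT/EOR ladder computes the same prefix XORs as the word-by-word recurrence. A sketch (the S-box table and function names are assumed, not from this file):

    #include <stdint.h>

    extern const uint8_t aes_sbox[256];     /* assumed FIPS-197 S-box */

    static uint32_t
    subword(uint32_t w)                     /* S-box each byte */
    {
        return (uint32_t)aes_sbox[w & 0xff] |
            (uint32_t)aes_sbox[(w >> 8) & 0xff] << 8 |
            (uint32_t)aes_sbox[(w >> 16) & 0xff] << 16 |
            (uint32_t)aes_sbox[(w >> 24) & 0xff] << 24;
    }

    static uint32_t
    rotword(uint32_t w)     /* rotate bytes left by one (LE words) */
    {
        return (w >> 8) | (w << 24);
    }

    /* One round-key step: prk[0..3] -> rk[0..3]. */
    static void
    expand128_step(uint32_t rk[4], const uint32_t prk[4], uint32_t rcon)
    {
        uint32_t t = rotword(subword(prk[3])) ^ rcon;

        rk[0] = t ^ prk[0];
        rk[1] = t ^ prk[0] ^ prk[1];
        rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2];
        rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3];
    }
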
163 | | | 163 | |
164 | /* | | 164 | /* |
165 | * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1) | | 165 | * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1) |
166 | * | | 166 | * |
167 | * Expand a 24-byte AES-192 key into 12 round keys. | | 167 | * Expand a 24-byte AES-192 key into 12 round keys. |
168 | * | | 168 | * |
169 | * Standard ABI calling convention. | | 169 | * Standard ABI calling convention. |
170 | */ | | 170 | */ |
171 | ENTRY(aesarmv8_setenckey192) | | 171 | ENTRY(aesarmv8_setenckey192) |
172 | ldr q1, [x1], #0x10 /* q1 := master key[0:128) */ | | 172 | ldr q1, [x1], #0x10 /* q1 := master key[0:128) */ |
173 | ldr d2, [x1] /* d2 := master key[128:192) */ | | 173 | ldr d2, [x1] /* d2 := master key[128:192) */ |
174 | | | 174 | |
175 | adrl x4, unshiftrows_rotword_1 | | 175 | adrl x4, unshiftrows_rotword_1 |
176 | adrl x5, unshiftrows_rotword_3 | | 176 | adrl x5, unshiftrows_rotword_3 |
177 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ | | 177 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ |
178 | ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */ | | 178 | ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */ |
179 | ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */ | | 179 | ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */ |
180 | | | 180 | |
181 | str q1, [x0], #0x10 /* store master key[0:128) as round key */ | | 181 | str q1, [x0], #0x10 /* store master key[0:128) as round key */ |
182 | mov x2, #12 /* round count */ | | 182 | mov x2, #12 /* round count */ |
183 | adrl x3, rcon /* round constant */ | | 183 | adrl x3, rcon /* round constant */ |
184 | | | 184 | |
185 | 1: /* | | 185 | 1: /* |
186 | * q0 = 0 | | 186 | * q0 = 0 |
187 | * v1.4s = (prk[0], prk[1], prk[2], prk[3]) | | 187 | * v1.4s = (prk[0], prk[1], prk[2], prk[3]) |
188 | * v2.4s = (rklo[0], rklo[1], xxx, xxx) | | 188 | * v2.4s = (rklo[0], rklo[1], xxx, xxx) |
189 | * x0 = pointer to three round keys to compute | | 189 | * x0 = pointer to three round keys to compute |
190 | * x2 = round count | | 190 | * x2 = round count |
191 | * x3 = rcon pointer | | 191 | * x3 = rcon pointer |
192 | */ | | 192 | */ |
193 | | | 193 | |
194 | /* q3 := ShiftRows(SubBytes(q2)) */ | | 194 | /* q3 := ShiftRows(SubBytes(q2)) */ |
195 | mov v3.16b, v2.16b | | 195 | mov v3.16b, v2.16b |
196 | aese v3.16b, v0.16b | | 196 | aese v3.16b, v0.16b |
197 | | | 197 | |
198 | /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */ | | 198 | /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */ |
199 | ld1r {v4.4s}, [x3], #4 | | 199 | ld1r {v4.4s}, [x3], #4 |
200 | tbl v3.16b, {v3.16b}, v16.16b | | 200 | tbl v3.16b, {v3.16b}, v16.16b |
201 | eor v3.16b, v3.16b, v4.16b | | 201 | eor v3.16b, v3.16b, v4.16b |
202 | | | 202 | |
203 | /* | | 203 | /* |
204 | * We need to compute: | | 204 | * We need to compute: |
205 | * | | 205 | * |
206 | * rk[0] := rklo[0] | | 206 | * rk[0] := rklo[0] |
207 | * rk[1] := rklo[1] | | 207 | * rk[1] := rklo[1] |
208 | * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] | | 208 | * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] |
209 | * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] | | 209 | * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] |
210 | * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2] | | 210 | * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2] |
211 | * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] | | 211 | * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] |
212 | * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] | | 212 | * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] |
213 | * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] | | 213 | * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0] |
214 | * ^ rklo[1] | | 214 | * ^ rklo[1] |
215 | */ | | 215 | */ |
216 | | | 216 | |
217 | /* | | 217 | /* |
218 | * v5.4s := (0,prk[0],prk[1],prk[2]) | | 218 | * v5.4s := (0,prk[0],prk[1],prk[2]) |
219 | * v6.4s := (0,0,prk[0],prk[1]) | | 219 | * v6.4s := (0,0,prk[0],prk[1]) |
220 | * v7.4s := (0,0,0,prk[0]) | | 220 | * v7.4s := (0,0,0,prk[0]) |
221 | */ | | 221 | */ |
222 | ext v5.16b, v0.16b, v1.16b, #12 | | 222 | ext v5.16b, v0.16b, v1.16b, #12 |
223 | ext v6.16b, v0.16b, v1.16b, #8 | | 223 | ext v6.16b, v0.16b, v1.16b, #8 |
224 | ext v7.16b, v0.16b, v1.16b, #4 | | 224 | ext v7.16b, v0.16b, v1.16b, #4 |
225 | | | 225 | |
226 | /* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */ | | 226 | /* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */ |
227 | eor v5.16b, v5.16b, v1.16b | | 227 | eor v5.16b, v5.16b, v1.16b |
228 | eor v5.16b, v5.16b, v3.16b | | 228 | eor v5.16b, v5.16b, v3.16b |
229 | eor v5.16b, v5.16b, v6.16b | | 229 | eor v5.16b, v5.16b, v6.16b |
230 | eor v5.16b, v5.16b, v7.16b | | 230 | eor v5.16b, v5.16b, v7.16b |
231 | | | 231 | |
232 | /* | | 232 | /* |
233 | * At this point, rk is split across v2.4s = (rk[0],rk[1],...) | | 233 | * At this point, rk is split across v2.4s = (rk[0],rk[1],...) |
234 | * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s = | | 234 | * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s = |
235 | * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or | | 235 | * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or |
236 | * nrk[3], which requires rklo[0] and rklo[1] in v2.4s = | | 236 | * nrk[3], which requires rklo[0] and rklo[1] in v2.4s = |
237 | * (rklo[0],rklo[1],...). | | 237 | * (rklo[0],rklo[1],...). |
238 | */ | | 238 | */ |
239 | | | 239 | |
240 | /* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */ | | 240 | /* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */ |
241 | dup v1.4s, v5.4s[3] | | 241 | dup v1.4s, v5.s[3] |
242 | mov v1.4s[0], v5.4s[2] | | 242 | mov v1.s[0], v5.s[2] |
243 | | | 243 | |
244 | /* | | 244 | /* |
245 | * v6.4s := (0, 0, rklo[0], rklo[1]) | | 245 | * v6.4s := (0, 0, rklo[0], rklo[1]) |
246 | * v7.4s := (0, 0, 0, rklo[0]) | | 246 | * v7.4s := (0, 0, 0, rklo[0]) |
247 | */ | | 247 | */ |
248 | ext v6.16b, v0.16b, v2.16b, #8 | | 248 | ext v6.16b, v0.16b, v2.16b, #8 |
249 | ext v7.16b, v0.16b, v2.16b, #4 | | 249 | ext v7.16b, v0.16b, v2.16b, #4 |
250 | | | 250 | |
251 | /* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ | | 251 | /* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ |
252 | eor v3.16b, v1.16b, v6.16b | | 252 | eor v3.16b, v1.16b, v6.16b |
253 | eor v3.16b, v3.16b, v7.16b | | 253 | eor v3.16b, v3.16b, v7.16b |
254 | | | 254 | |
255 | /* | | 255 | /* |
256 | * Recall v2.4s = (rk[0], rk[1], xxx, xxx) | | 256 | * Recall v2.4s = (rk[0], rk[1], xxx, xxx) |
257 | * and v5.4s = (rk[2], rk[3], xxx, xxx). Set | | 257 | * and v5.4s = (rk[2], rk[3], xxx, xxx). Set |
258 | * v2.4s := (rk[0], rk[1], rk[2], rk[3]) | | 258 | * v2.4s := (rk[0], rk[1], rk[2], rk[3]) |
259 | */ | | 259 | */ |
260 | mov v2.2d[1], v5.2d[0] | | 260 | mov v2.d[1], v5.d[0] |
261 | | | 261 | |
262 | /* store two round keys */ | | 262 | /* store two round keys */ |
263 | stp q2, q3, [x0], #0x20 | | 263 | stp q2, q3, [x0], #0x20 |
264 | | | 264 | |
265 | /* | | 265 | /* |
266 | * Live vector registers at this point: | | 266 | * Live vector registers at this point: |
267 | * | | 267 | * |
268 | * q0 = zero | | 268 | * q0 = zero |
269 | * q2 = rk | | 269 | * q2 = rk |
270 | * q3 = nrk | | 270 | * q3 = nrk |
271 | * v5.4s = (rk[2], rk[3], nrk[0], nrk[1]) | | 271 | * v5.4s = (rk[2], rk[3], nrk[0], nrk[1]) |
272 | * q16 = unshiftrows_rotword_1 | | 272 | * q16 = unshiftrows_rotword_1 |
273 | * q17 = unshiftrows_rotword_3 | | 273 | * q17 = unshiftrows_rotword_3 |
274 | * | | 274 | * |
275 | * We have to compute, in q1: | | 275 | * We have to compute, in q1: |
276 | * | | 276 | * |
277 | * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] | | 277 | * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] |
278 | * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] | | 278 | * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] |
279 | * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] | | 279 | * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] |
280 | * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] | | 280 | * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] |
281 | * ^ nrk[1] | | 281 | * ^ nrk[1] |
282 | * | | 282 | * |
283 | * And, if there's any more afterward, in q2: | | 283 | * And, if there's any more afterward, in q2: |
284 | * | | 284 | * |
285 | * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] | | 285 | * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] |
286 | * ^ nrk[1] ^ nrk[2] | | 286 | * ^ nrk[1] ^ nrk[2] |
287 | * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] | | 287 | * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0] |
288 | * ^ nrk[1] ^ nrk[2] ^ nrk[3] | | 288 | * ^ nrk[1] ^ nrk[2] ^ nrk[3] |
289 | */ | | 289 | */ |
290 | | | 290 | |
291 | /* q1 := RotWords(SubBytes(q3)) */ | | 291 | /* q1 := RotWords(SubBytes(q3)) */ |
292 | mov v1.16b, v3.16b | | 292 | mov v1.16b, v3.16b |
293 | aese v1.16b, v0.16b | | 293 | aese v1.16b, v0.16b |
294 | | | 294 | |
295 | /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */ | | 295 | /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */ |
296 | ld1r {v4.4s}, [x3], #4 | | 296 | ld1r {v4.4s}, [x3], #4 |
297 | tbl v1.16b, {v1.16b}, v17.16b | | 297 | tbl v1.16b, {v1.16b}, v17.16b |
298 | eor v1.16b, v1.16b, v4.16b | | 298 | eor v1.16b, v1.16b, v4.16b |
299 | | | 299 | |
300 | /* | | 300 | /* |
301 | * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already] | | 301 | * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already] |
302 | * v4.4s := (0, rk[2], rk[3], nrk[0]) | | 302 | * v4.4s := (0, rk[2], rk[3], nrk[0]) |
303 | * v6.4s := (0, 0, rk[2], rk[3]) | | 303 | * v6.4s := (0, 0, rk[2], rk[3]) |
304 | * v7.4s := (0, 0, 0, rk[2]) | | 304 | * v7.4s := (0, 0, 0, rk[2]) |
305 | */ | | 305 | */ |
306 | ext v4.16b, v0.16b, v5.16b, #12 | | 306 | ext v4.16b, v0.16b, v5.16b, #12 |
307 | ext v6.16b, v0.16b, v5.16b, #8 | | 307 | ext v6.16b, v0.16b, v5.16b, #8 |
308 | ext v7.16b, v0.16b, v5.16b, #4 | | 308 | ext v7.16b, v0.16b, v5.16b, #4 |
309 | | | 309 | |
310 | /* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */ | | 310 | /* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */ |
311 | eor v1.16b, v1.16b, v5.16b | | 311 | eor v1.16b, v1.16b, v5.16b |
312 | eor v1.16b, v1.16b, v4.16b | | 312 | eor v1.16b, v1.16b, v4.16b |
313 | eor v1.16b, v1.16b, v6.16b | | 313 | eor v1.16b, v1.16b, v6.16b |
314 | eor v1.16b, v1.16b, v7.16b | | 314 | eor v1.16b, v1.16b, v7.16b |
315 | | | 315 | |
316 | subs x2, x2, #3 /* count down three rounds */ | | 316 | subs x2, x2, #3 /* count down three rounds */ |
317 | str q1, [x0], #0x10 /* store third round key */ | | 317 | str q1, [x0], #0x10 /* store third round key */ |
318 | b.eq 2f | | 318 | b.eq 2f |
319 | | | 319 | |
320 | /* | | 320 | /* |
321 | * v4.4s := (nrk[2], nrk[3], xxx, xxx) | | 321 | * v4.4s := (nrk[2], nrk[3], xxx, xxx) |
322 | * v5.4s := (0, nrk[2], xxx, xxx) | | 322 | * v5.4s := (0, nrk[2], xxx, xxx) |
323 | */ | | 323 | */ |
324 | ext v4.16b, v3.16b, v0.16b, #8 | | 324 | ext v4.16b, v3.16b, v0.16b, #8 |
325 | ext v5.16b, v0.16b, v4.16b, #12 | | 325 | ext v5.16b, v0.16b, v4.16b, #12 |
326 | | | 326 | |
327 | /* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */ | | 327 | /* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */ |
328 | dup v2.4s, v1.4s[3] | | 328 | dup v2.4s, v1.s[3] |
329 | | | 329 | |
330 | /* | | 330 | /* |
331 | * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2], | | 331 | * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2], |
332 | * nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3], | | 332 | * nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3], |
333 | * xxx, xxx) | | 333 | * xxx, xxx) |
334 | */ | | 334 | */ |
335 | eor v2.16b, v2.16b, v4.16b | | 335 | eor v2.16b, v2.16b, v4.16b |
336 | eor v2.16b, v2.16b, v5.16b | | 336 | eor v2.16b, v2.16b, v5.16b |
337 | | | 337 | |
338 | b 1b | | 338 | b 1b |
339 | | | 339 | |
340 | 2: ret | | 340 | 2: ret |
341 | END(aesarmv8_setenckey192) | | 341 | END(aesarmv8_setenckey192) |
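
The interleaving above is easier to follow against the plain word-oriented AES-192 schedule, which produces 52 words (13 round keys); each pass of the loop covers twelve words, i.e. three 16-byte round keys. A sketch reusing subword/rotword from the AES-128 example, with rcon[] being the table at the top of this file (ld32le is an assumed little-endian load helper):

    static uint32_t
    ld32le(const uint8_t *p)
    {
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
            (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
    }

    static void
    expand192(uint32_t w[52], const uint8_t key[24])
    {
        int i;

        for (i = 0; i < 6; i++)
            w[i] = ld32le(key + 4*i);
        for (; i < 52; i++) {
            uint32_t t = w[i - 1];

            if (i % 6 == 0)
                t = rotword(subword(t)) ^ rcon[i/6 - 1];
            w[i] = w[i - 6] ^ t;
        }
    }
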
342 | | | 342 | |
343 | /* | | 343 | /* |
344 | * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1) | | 344 | * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1) |
345 | * | | 345 | * |
346 | * Expand a 32-byte AES-256 key into 14 round keys. | | 346 | * Expand a 32-byte AES-256 key into 14 round keys. |
347 | * | | 347 | * |
348 | * Standard ABI calling convention. | | 348 | * Standard ABI calling convention. |
349 | */ | | 349 | */ |
350 | ENTRY(aesarmv8_setenckey256) | | 350 | ENTRY(aesarmv8_setenckey256) |
351 | /* q1 := key[0:128), q2 := key[128:256) */ | | 351 | /* q1 := key[0:128), q2 := key[128:256) */ |
352 | ldp q1, q2, [x1], #0x20 | | 352 | ldp q1, q2, [x1], #0x20 |
353 | | | 353 | |
354 | adrl x4, unshiftrows_rotword_3 | | 354 | adrl x4, unshiftrows_rotword_3 |
355 | adrl x5, unshiftrows_3 | | 355 | adrl x5, unshiftrows_3 |
356 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ | | 356 | eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ |
357 | ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */ | | 357 | ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */ |
358 | ldr q17, [x5] /* q17 := unshiftrows_3 */ | | 358 | ldr q17, [x5] /* q17 := unshiftrows_3 */ |
359 | | | 359 | |
360 | /* store master key as first two round keys */ | | 360 | /* store master key as first two round keys */ |
361 | stp q1, q2, [x0], #0x20 | | 361 | stp q1, q2, [x0], #0x20 |
362 | mov x2, #14 /* round count */ | | 362 | mov x2, #14 /* round count */ |
363 | adrl x3, rcon /* round constant */ | | 363 | adrl x3, rcon /* round constant */ |
364 | | | 364 | |
365 | 1: /* | | 365 | 1: /* |
366 | * q0 = 0 | | 366 | * q0 = 0 |
367 | * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3]) | | 367 | * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3]) |
368 | * v2.4s = (prk[0], prk[1], prk[2], prk[3]) | | 368 | * v2.4s = (prk[0], prk[1], prk[2], prk[3]) |
369 | * x2 = round count | | 369 | * x2 = round count |
370 | * x3 = rcon pointer | | 370 | * x3 = rcon pointer |
371 | */ | | 371 | */ |
372 | | | 372 | |
373 | /* q3 := ShiftRows(SubBytes(q2)) */ | | 373 | /* q3 := ShiftRows(SubBytes(q2)) */ |
374 | mov v3.16b, v2.16b | | 374 | mov v3.16b, v2.16b |
375 | aese v3.16b, v0.16b | | 375 | aese v3.16b, v0.16b |
376 | | | 376 | |
377 | /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ | | 377 | /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ |
378 | ld1r {v4.4s}, [x3], #4 | | 378 | ld1r {v4.4s}, [x3], #4 |
379 | tbl v3.16b, {v3.16b}, v16.16b | | 379 | tbl v3.16b, {v3.16b}, v16.16b |
380 | eor v3.16b, v3.16b, v4.16b | | 380 | eor v3.16b, v3.16b, v4.16b |
381 | | | 381 | |
382 | /* | | 382 | /* |
383 | * v5.4s := (0,pprk[0],pprk[1],pprk[2]) | | 383 | * v5.4s := (0,pprk[0],pprk[1],pprk[2]) |
384 | * v6.4s := (0,0,pprk[0],pprk[1]) | | 384 | * v6.4s := (0,0,pprk[0],pprk[1]) |
385 | * v7.4s := (0,0,0,pprk[0]) | | 385 | * v7.4s := (0,0,0,pprk[0]) |
386 | */ | | 386 | */ |
387 | ext v5.16b, v0.16b, v1.16b, #12 | | 387 | ext v5.16b, v0.16b, v1.16b, #12 |
388 | ext v6.16b, v0.16b, v1.16b, #8 | | 388 | ext v6.16b, v0.16b, v1.16b, #8 |
389 | ext v7.16b, v0.16b, v1.16b, #4 | | 389 | ext v7.16b, v0.16b, v1.16b, #4 |
390 | | | 390 | |
391 | /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ | | 391 | /* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */ |
392 | eor v1.16b, v1.16b, v3.16b | | 392 | eor v1.16b, v1.16b, v3.16b |
393 | eor v1.16b, v1.16b, v5.16b | | 393 | eor v1.16b, v1.16b, v5.16b |
394 | eor v1.16b, v1.16b, v6.16b | | 394 | eor v1.16b, v1.16b, v6.16b |
395 | eor v1.16b, v1.16b, v7.16b | | 395 | eor v1.16b, v1.16b, v7.16b |
396 | | | 396 | |
397 | subs x2, x2, #2 /* count down two rounds */ | | 397 | subs x2, x2, #2 /* count down two rounds */ |
398 | b.eq 2f /* stop if this is the last one */ | | 398 | b.eq 2f /* stop if this is the last one */ |
399 | | | 399 | |
400 | /* q3 := ShiftRows(SubBytes(q1)) */ | | 400 | /* q3 := ShiftRows(SubBytes(q1)) */ |
401 | mov v3.16b, v1.16b | | 401 | mov v3.16b, v1.16b |
402 | aese v3.16b, v0.16b | | 402 | aese v3.16b, v0.16b |
403 | | | 403 | |
404 | /* v3.4s[i] := SubBytes(rk[3]) */ | | 404 | /* v3.4s[i] := SubBytes(rk[3]) */ |
405 | tbl v3.16b, {v3.16b}, v17.16b | | 405 | tbl v3.16b, {v3.16b}, v17.16b |
406 | | | 406 | |
407 | /* | | 407 | /* |
408 | * v5.4s := (0,prk[0],prk[1],prk[2]) | | 408 | * v5.4s := (0,prk[0],prk[1],prk[2]) |
409 | * v6.4s := (0,0,prk[0],prk[1]) | | 409 | * v6.4s := (0,0,prk[0],prk[1]) |
410 | * v7.4s := (0,0,0,prk[0]) | | 410 | * v7.4s := (0,0,0,prk[0]) |
411 | */ | | 411 | */ |
412 | ext v5.16b, v0.16b, v2.16b, #12 | | 412 | ext v5.16b, v0.16b, v2.16b, #12 |
413 | ext v6.16b, v0.16b, v2.16b, #8 | | 413 | ext v6.16b, v0.16b, v2.16b, #8 |
414 | ext v7.16b, v0.16b, v2.16b, #4 | | 414 | ext v7.16b, v0.16b, v2.16b, #4 |
415 | | | 415 | |
416 | /* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ | | 416 | /* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */ |
417 | eor v2.16b, v2.16b, v3.16b | | 417 | eor v2.16b, v2.16b, v3.16b |
418 | eor v2.16b, v2.16b, v5.16b | | 418 | eor v2.16b, v2.16b, v5.16b |
419 | eor v2.16b, v2.16b, v6.16b | | 419 | eor v2.16b, v2.16b, v6.16b |
420 | eor v2.16b, v2.16b, v7.16b | | 420 | eor v2.16b, v2.16b, v7.16b |
421 | | | 421 | |
422 | stp q1, q2, [x0], #0x20 /* store two round keys */ | | 422 | stp q1, q2, [x0], #0x20 /* store two round keys */ |
423 | b 1b | | 423 | b 1b |
424 | | | 424 | |
425 | 2: str q1, [x0] /* store last round key */ | | 425 | 2: str q1, [x0] /* store last round key */ |
426 | ret | | 426 | ret |
427 | END(aesarmv8_setenckey256) | | 427 | END(aesarmv8_setenckey256) |
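
Here the even steps use RotWord+SubWord+rcon (via unshiftrows_rotword_3) and the odd steps use SubWord alone (via unshiftrows_3), matching the standard AES-256 schedule of 60 words (15 round keys). A sketch, reusing ld32le/subword/rotword from the examples above:

    static void
    expand256(uint32_t w[60], const uint8_t key[32])
    {
        int i;

        for (i = 0; i < 8; i++)
            w[i] = ld32le(key + 4*i);
        for (; i < 60; i++) {
            uint32_t t = w[i - 1];

            if (i % 8 == 0)
                t = rotword(subword(t)) ^ rcon[i/8 - 1];
            else if (i % 8 == 4)
                t = subword(t);  /* SubWord only: no rotate, no rcon */
            w[i] = w[i - 8] ^ t;
        }
    }
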
428 | | | 428 | |
429 | /* | | 429 | /* |
430 | * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1, | | 430 | * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1, |
431 | * uint32_t nrounds@x2) | | 431 | * uint32_t nrounds@x2) |
432 | * | | 432 | * |
433 | * Convert AES encryption round keys to AES decryption round keys. | | 433 | * Convert AES encryption round keys to AES decryption round keys. |
434 | * `nrounds' must be between 10 and 14. | | 434 | * `nrounds' must be between 10 and 14. |
435 | * | | 435 | * |
436 | * Standard ABI calling convention. | | 436 | * Standard ABI calling convention. |
437 | */ | | 437 | */ |
438 | ENTRY(aesarmv8_enctodec) | | 438 | ENTRY(aesarmv8_enctodec) |
439 | ldr q0, [x0, x2, lsl #4] /* load last round key */ | | 439 | ldr q0, [x0, x2, lsl #4] /* load last round key */ |
440 | 1: str q0, [x1], #0x10 /* store round key */ | | 440 | 1: str q0, [x1], #0x10 /* store round key */ |
441 | subs x2, x2, #1 /* count down round */ | | 441 | subs x2, x2, #1 /* count down round */ |
442 | ldr q0, [x0, x2, lsl #4] /* load previous round key */ | | 442 | ldr q0, [x0, x2, lsl #4] /* load previous round key */ |
443 | b.eq 2f /* stop if this is the last one */ | | 443 | b.eq 2f /* stop if this is the last one */ |
444 | aesimc v0.16b, v0.16b /* convert encryption to decryption */ | | 444 | aesimc v0.16b, v0.16b /* convert encryption to decryption */ |
445 | b 1b | | 445 | b 1b |
446 | 2: str q0, [x1] /* store first round key verbatim */ | | 446 | 2: str q0, [x1] /* store first round key verbatim */ |
447 | ret | | 447 | ret |
448 | END(aesarmv8_enctodec) | | 448 | END(aesarmv8_enctodec) |
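
Equivalently, in C-flavoured pseudocode (inv_mix_columns, i.e. the AESIMC operation on one 16-byte key, is an assumed helper; enc[]/dec[] stand for the round keys of the two schedules):

    memcpy(dec[0], enc[nrounds], 16);           /* last key, verbatim */
    for (unsigned i = 1; i < nrounds; i++)
        inv_mix_columns(dec[i], enc[nrounds - i]);
    memcpy(dec[nrounds], enc[0], 16);           /* first key, verbatim */

The reversal plus InvMixColumns yields the key schedule of the equivalent inverse cipher, so decryption can run its rounds in the same order as encryption.
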
449 | | | 449 | |
450 | /* | | 450 | /* |
451 | * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1, | | 451 | * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1, |
452 | * uint8_t out[16] @x2, uint32_t nrounds@x3) | | 452 | * uint8_t out[16] @x2, uint32_t nrounds@x3) |
453 | * | | 453 | * |
454 | * Encrypt a single block. | | 454 | * Encrypt a single block. |
455 | * | | 455 | * |
456 | * Standard ABI calling convention. | | 456 | * Standard ABI calling convention. |
457 | */ | | 457 | */ |
458 | ENTRY(aesarmv8_enc) | | 458 | ENTRY(aesarmv8_enc) |
459 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 459 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
460 | mov fp, sp | | 460 | mov fp, sp |
461 | ldr q0, [x1] /* q0 := ptxt */ | | 461 | ldr q0, [x1] /* q0 := ptxt */ |
462 | bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ | | 462 | bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ |
463 | str q0, [x2] /* store ctxt */ | | 463 | str q0, [x2] /* store ctxt */ |
464 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 464 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
465 | ret | | 465 | ret |
466 | END(aesarmv8_enc) | | 466 | END(aesarmv8_enc) |
467 | | | 467 | |
468 | /* | | 468 | /* |
469 | * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1, | | 469 | * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1, |
470 | * uint8_t out[16] @x2, uint32_t nrounds@x3) | | 470 | * uint8_t out[16] @x2, uint32_t nrounds@x3) |
471 | * | | 471 | * |
472 | * Decrypt a single block. | | 472 | * Decrypt a single block. |
473 | * | | 473 | * |
474 | * Standard ABI calling convention. | | 474 | * Standard ABI calling convention. |
475 | */ | | 475 | */ |
476 | ENTRY(aesarmv8_dec) | | 476 | ENTRY(aesarmv8_dec) |
477 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 477 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
478 | mov fp, sp | | 478 | mov fp, sp |
479 | ldr q0, [x1] /* q0 := ctxt */ | | 479 | ldr q0, [x1] /* q0 := ctxt */ |
480 | bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */ | | 480 | bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */ |
481 | str q0, [x2] /* store ptxt */ | | 481 | str q0, [x2] /* store ptxt */ |
482 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 482 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
483 | ret | | 483 | ret |
484 | END(aesarmv8_dec) | | 484 | END(aesarmv8_dec) |
485 | | | 485 | |
486 | /* | | 486 | /* |
487 | * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1, | | 487 | * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1, |
488 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4, | | 488 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4, |
489 | * uint32_t nrounds@x5) | | 489 | * uint32_t nrounds@x5) |
490 | * | | 490 | * |
491 | * Encrypt a contiguous sequence of blocks with AES-CBC. | | 491 | * Encrypt a contiguous sequence of blocks with AES-CBC. |
492 | * | | 492 | * |
493 | * nbytes must be an integral multiple of 16. | | 493 | * nbytes must be an integral multiple of 16. |
494 | * | | 494 | * |
495 | * Standard ABI calling convention. | | 495 | * Standard ABI calling convention. |
496 | */ | | 496 | */ |
497 | ENTRY(aesarmv8_cbc_enc) | | 497 | ENTRY(aesarmv8_cbc_enc) |
498 | cbz x3, 2f /* stop if nothing to do */ | | 498 | cbz x3, 2f /* stop if nothing to do */ |
499 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 499 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
500 | mov fp, sp | | 500 | mov fp, sp |
501 | mov x9, x0 /* x9 := enckey */ | | 501 | mov x9, x0 /* x9 := enckey */ |
502 | mov x10, x3 /* x10 := nbytes */ | | 502 | mov x10, x3 /* x10 := nbytes */ |
503 | ldr q0, [x4] /* q0 := chaining value */ | | 503 | ldr q0, [x4] /* q0 := chaining value */ |
504 | 1: ldr q1, [x1], #0x10 /* q1 := plaintext block */ | | 504 | 1: ldr q1, [x1], #0x10 /* q1 := plaintext block */ |
505 | eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ | | 505 | eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ |
506 | mov x0, x9 /* x0 := enckey */ | | 506 | mov x0, x9 /* x0 := enckey */ |
507 | mov x3, x5 /* x3 := nrounds */ | | 507 | mov x3, x5 /* x3 := nrounds */ |
508 | bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ | | 508 | bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */ |
509 | subs x10, x10, #0x10 /* count down nbytes */ | | 509 | subs x10, x10, #0x10 /* count down nbytes */ |
510 | str q0, [x2], #0x10 /* store ciphertext block */ | | 510 | str q0, [x2], #0x10 /* store ciphertext block */ |
511 | b.ne 1b /* repeat if x10 is nonzero */ | | 511 | b.ne 1b /* repeat if x10 is nonzero */ |
512 | str q0, [x4] /* store chaining value */ | | 512 | str q0, [x4] /* store chaining value */ |
513 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 513 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
514 | 2: ret | | 514 | 2: ret |
515 | END(aesarmv8_cbc_enc) | | 515 | END(aesarmv8_cbc_enc) |
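
The loop implements the usual CBC recurrence, with the chaining value written back through iv[] so successive calls chain together. A C sketch (aes_enc_block and xor16 are assumed helpers, not functions from this file):

    uint8_t cv[16], tmp[16];

    memcpy(cv, iv, 16);
    for (size_t off = 0; off < nbytes; off += 16) {
        xor16(tmp, cv, in + off);                 /* cv ^ ptxt */
        aes_enc_block(enckey, tmp, cv, nrounds);  /* cv := AES(cv ^ ptxt) */
        memcpy(out + off, cv, 16);
    }
    memcpy(iv, cv, 16);                           /* chain into next call */
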
516 | | | 516 | |
517 | /* | | 517 | /* |
518 | * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, | | 518 | * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, |
519 | * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, | | 519 | * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, |
520 | * uint32_t nrounds@x5) | | 520 | * uint32_t nrounds@x5) |
521 | * | | 521 | * |
522 | * Decrypt a contiguous sequence of blocks with AES-CBC. | | 522 | * Decrypt a contiguous sequence of blocks with AES-CBC. |
523 | * | | 523 | * |
524 | * nbytes must be a positive integral multiple of 16. This routine | | 524 | * nbytes must be a positive integral multiple of 16. This routine |
525 | * is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once. | | 525 | * is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once. |
526 | * | | 526 | * |
527 | * Standard ABI calling convention. | | 527 | * Standard ABI calling convention. |
528 | */ | | 528 | */ |
529 | ENTRY(aesarmv8_cbc_dec1) | | 529 | ENTRY(aesarmv8_cbc_dec1) |
530 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 530 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
531 | mov fp, sp | | 531 | mov fp, sp |
532 | ldr q24, [x4] /* q24 := iv */ | | 532 | ldr q24, [x4] /* q24 := iv */ |
533 | mov x9, x0 /* x9 := deckey */ | | 533 | mov x9, x0 /* x9 := deckey */ |
534 | mov x10, x3 /* x10 := nbytes */ | | 534 | mov x10, x3 /* x10 := nbytes */ |
535 | add x1, x1, x3 /* x1 := pointer past end of in */ | | 535 | add x1, x1, x3 /* x1 := pointer past end of in */ |
536 | add x2, x2, x3 /* x2 := pointer past end of out */ | | 536 | add x2, x2, x3 /* x2 := pointer past end of out */ |
537 | ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */ | | 537 | ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */ |
538 | str q0, [x4] /* update iv */ | | 538 | str q0, [x4] /* update iv */ |
539 | 1: mov x0, x9 /* x0 := deckey */ | | 539 | 1: mov x0, x9 /* x0 := deckey */ |
540 | mov x3, x5 /* x3 := nrounds */ | | 540 | mov x3, x5 /* x3 := nrounds */ |
541 | bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */ | | 541 | bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */ |
542 | subs x10, x10, #0x10 /* count down nbytes */ | | 542 | subs x10, x10, #0x10 /* count down nbytes */ |
543 | b.eq 2f /* stop if this is the first block */ | | 543 | b.eq 2f /* stop if this is the first block */ |
544 | ldr q31, [x1, #-0x10]! /* q31 := chaining value */ | | 544 | ldr q31, [x1, #-0x10]! /* q31 := chaining value */ |
545 | eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */ | | 545 | eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */ |
546 | str q0, [x2, #-0x10]! /* store plaintext block */ | | 546 | str q0, [x2, #-0x10]! /* store plaintext block */ |
547 | mov v0.16b, v31.16b /* move cv = ciphertext block */ | | 547 | mov v0.16b, v31.16b /* move cv = ciphertext block */ |
548 | b 1b | | 548 | b 1b |
549 | 2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */ | | 549 | 2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */ |
550 | str q0, [x2, #-0x10]! /* store first plaintext block */ | | 550 | str q0, [x2, #-0x10]! /* store first plaintext block */ |
551 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 551 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
552 | ret | | 552 | ret |
553 | END(aesarmv8_cbc_dec1) | | 553 | END(aesarmv8_cbc_dec1) |
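
Written in forward order, the computation above is pt[i] = AES_dec(ct[i]) ^ ct[i-1], with ct[-1] = iv; the assembly walks the buffer back to front so it can record the new iv (the last ciphertext block) first and then reread each earlier ciphertext block as the chaining value. A sketch with assumed helpers:

    for (size_t i = nbytes/16; i-- > 0;) {
        uint8_t tmp[16];

        aes_dec_block(deckey, in + 16*i, tmp, nrounds);
        xor16(out + 16*i, tmp, i ? in + 16*(i - 1) : iv);
    }
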
554 | | | 554 | |
555 | /* | | 555 | /* |
556 | * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, | | 556 | * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, |
557 | * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, | | 557 | * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4, |
558 | * uint32_t nrounds@x5) | | 558 | * uint32_t nrounds@x5) |
559 | * | | 559 | * |
560 | * Decrypt a contiguous sequence of 8-block units with AES-CBC. | | 560 | * Decrypt a contiguous sequence of 8-block units with AES-CBC. |
561 | * | | 561 | * |
562 | * nbytes must be a positive integral multiple of 128. | | 562 | * nbytes must be a positive integral multiple of 128. |
563 | * | | 563 | * |
564 | * Standard ABI calling convention. | | 564 | * Standard ABI calling convention. |
565 | */ | | 565 | */ |
566 | ENTRY(aesarmv8_cbc_dec8) | | 566 | ENTRY(aesarmv8_cbc_dec8) |
567 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 567 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
568 | mov fp, sp | | 568 | mov fp, sp |
569 | ldr q24, [x4] /* q24 := iv */ | | 569 | ldr q24, [x4] /* q24 := iv */ |
570 | mov x9, x0 /* x9 := deckey */ | | 570 | mov x9, x0 /* x9 := deckey */ |
571 | mov x10, x3 /* x10 := nbytes */ | | 571 | mov x10, x3 /* x10 := nbytes */ |
572 | add x1, x1, x3 /* x1 := pointer past end of in */ | | 572 | add x1, x1, x3 /* x1 := pointer past end of in */ |
573 | add x2, x2, x3 /* x2 := pointer past end of out */ | | 573 | add x2, x2, x3 /* x2 := pointer past end of out */ |
574 | ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */ | | 574 | ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */ |
575 | str q7, [x4] /* update iv */ | | 575 | str q7, [x4] /* update iv */ |
576 | 1: ldp q4, q5, [x1, #-0x20]! | | 576 | 1: ldp q4, q5, [x1, #-0x20]! |
577 | ldp q2, q3, [x1, #-0x20]! | | 577 | ldp q2, q3, [x1, #-0x20]! |
578 | ldp q0, q1, [x1, #-0x20]! | | 578 | ldp q0, q1, [x1, #-0x20]! |
579 | mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */ | | 579 | mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */ |
580 | mov v30.16b, v5.16b | | 580 | mov v30.16b, v5.16b |
581 | mov v29.16b, v4.16b | | 581 | mov v29.16b, v4.16b |
582 | mov v28.16b, v3.16b | | 582 | mov v28.16b, v3.16b |
583 | mov v27.16b, v2.16b | | 583 | mov v27.16b, v2.16b |
584 | mov v26.16b, v1.16b | | 584 | mov v26.16b, v1.16b |
585 | mov v25.16b, v0.16b | | 585 | mov v25.16b, v0.16b |
586 | mov x0, x9 /* x0 := deckey */ | | 586 | mov x0, x9 /* x0 := deckey */ |
587 | mov x3, x5 /* x3 := nrounds */ | | 587 | mov x3, x5 /* x3 := nrounds */ |
588 | bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i]; | | 588 | bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i]; |
589 | * trash x0/x3/q16 */ | | 589 | * trash x0/x3/q16 */ |
590 | eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */ | | 590 | eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */ |
591 | eor v6.16b, v6.16b, v30.16b | | 591 | eor v6.16b, v6.16b, v30.16b |
592 | eor v5.16b, v5.16b, v29.16b | | 592 | eor v5.16b, v5.16b, v29.16b |
593 | eor v4.16b, v4.16b, v28.16b | | 593 | eor v4.16b, v4.16b, v28.16b |
594 | eor v3.16b, v3.16b, v27.16b | | 594 | eor v3.16b, v3.16b, v27.16b |
595 | eor v2.16b, v2.16b, v26.16b | | 595 | eor v2.16b, v2.16b, v26.16b |
596 | eor v1.16b, v1.16b, v25.16b | | 596 | eor v1.16b, v1.16b, v25.16b |
597 | subs x10, x10, #0x80 /* count down nbytes */ | | 597 | subs x10, x10, #0x80 /* count down nbytes */ |
598 | stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */ | | 598 | stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */ |
599 | stp q4, q5, [x2, #-0x20]! | | 599 | stp q4, q5, [x2, #-0x20]! |
600 | stp q2, q3, [x2, #-0x20]! | | 600 | stp q2, q3, [x2, #-0x20]! |
601 | b.eq 2f /* stop if this is the first block */ | | 601 | b.eq 2f /* stop if this is the first block */ |
602 | ldp q6, q7, [x1, #-0x20]! | | 602 | ldp q6, q7, [x1, #-0x20]! |
603 | eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */ | | 603 | eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */ |
604 | stp q0, q1, [x2, #-0x20]! | | 604 | stp q0, q1, [x2, #-0x20]! |
605 | b 1b | | 605 | b 1b |
606 | 2: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */ | | 606 | 2: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */ |
607 | stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */ | | 607 | stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */ |
608 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 608 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
609 | ret | | 609 | ret |
610 | END(aesarmv8_cbc_dec8) | | 610 | END(aesarmv8_cbc_dec8) |
611 | | | 611 | |
612 | /* | | 612 | /* |
613 | * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1, | | 613 | * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1, |
614 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, | | 614 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, |
615 | * uint32_t nrounds@x5) | | 615 | * uint32_t nrounds@x5) |
616 | * | | 616 | * |
617 | * Encrypt a contiguous sequence of blocks with AES-XTS. | | 617 | * Encrypt a contiguous sequence of blocks with AES-XTS. |
618 | * | | 618 | * |
619 | * nbytes must be a positive integral multiple of 16. This routine | | 619 | * nbytes must be a positive integral multiple of 16. This routine |
620 | * is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once. | | 620 | * is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once. |
621 | * | | 621 | * |
622 | * Standard ABI calling convention. | | 622 | * Standard ABI calling convention. |
623 | */ | | 623 | */ |
624 | ENTRY(aesarmv8_xts_enc1) | | 624 | ENTRY(aesarmv8_xts_enc1) |
625 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 625 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
626 | mov fp, sp | | 626 | mov fp, sp |
627 | mov x9, x0 /* x9 := enckey */ | | 627 | mov x9, x0 /* x9 := enckey */ |
628 | mov x10, x3 /* x10 := nbytes */ | | 628 | mov x10, x3 /* x10 := nbytes */ |
629 | ldr q31, [x4] /* q31 := tweak */ | | 629 | ldr q31, [x4] /* q31 := tweak */ |
630 | 1: ldr q0, [x1], #0x10 /* q0 := ptxt */ | | 630 | 1: ldr q0, [x1], #0x10 /* q0 := ptxt */ |
631 | mov x0, x9 /* x0 := enckey */ | | 631 | mov x0, x9 /* x0 := enckey */ |
632 | mov x3, x5 /* x3 := nrounds */ | | 632 | mov x3, x5 /* x3 := nrounds */ |
633 | eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */ | | 633 | eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */ |
634 | bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */ | | 634 | bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */ |
635 | eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */ | | 635 | eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */ |
636 | str q0, [x2], #0x10 /* store ciphertext block */ | | 636 | str q0, [x2], #0x10 /* store ciphertext block */ |
637 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 637 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
638 | subs x10, x10, #0x10 /* count down nbytes */ | | 638 | subs x10, x10, #0x10 /* count down nbytes */ |
639 | b.ne 1b /* repeat if more blocks */ | | 639 | b.ne 1b /* repeat if more blocks */ |
640 | str q31, [x4] /* update tweak */ | | 640 | str q31, [x4] /* update tweak */ |
641 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 641 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
642 | ret | | 642 | ret |
643 | END(aesarmv8_xts_enc1) | | 643 | END(aesarmv8_xts_enc1) |
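
The tweak update delegated to aesarmv8_xts_mulx is multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, i.e. a 1-bit left shift of the 128-bit tweak with 0x87 folded into the low byte on carry-out. A C sketch of that step (hypothetical name; treats the tweak as little-endian bytes, as XTS does):

    static void
    xts_mulx(uint8_t t[16])
    {
        unsigned carry = 0;

        for (int i = 0; i < 16; i++) {
            unsigned next = t[i] >> 7;

            t[i] = (uint8_t)((t[i] << 1) | carry);
            carry = next;
        }
        if (carry)
            t[0] ^= 0x87;
    }
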
644 | | | 644 | |
645 | /* | | 645 | /* |
646 | * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1, | | 646 | * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1, |
647 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, | | 647 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, |
648 | * uint32_t nrounds@x5) | | 648 | * uint32_t nrounds@x5) |
649 | * | | 649 | * |
650 | * Encrypt a contiguous sequence of blocks with AES-XTS. | | 650 | * Encrypt a contiguous sequence of blocks with AES-XTS. |
651 | * | | 651 | * |
652 | * nbytes must be a positive integral multiple of 128. | | 652 | * nbytes must be a positive integral multiple of 128. |
653 | * | | 653 | * |
654 | * Standard ABI calling convention. | | 654 | * Standard ABI calling convention. |
655 | */ | | 655 | */ |
656 | ENTRY(aesarmv8_xts_enc8) | | 656 | ENTRY(aesarmv8_xts_enc8) |
657 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 657 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
658 | mov fp, sp | | 658 | mov fp, sp |
659 | mov x9, x0 /* x9 := enckey */ | | 659 | mov x9, x0 /* x9 := enckey */ |
660 | mov x10, x3 /* x10 := nbytes */ | | 660 | mov x10, x3 /* x10 := nbytes */ |
661 | ldr q31, [x4] /* q31 := tweak */ | | 661 | ldr q31, [x4] /* q31 := tweak */ |
662 | 1: mov v24.16b, v31.16b /* q24 := tweak[0] */ | | 662 | 1: mov v24.16b, v31.16b /* q24 := tweak[0] */ |
663 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 663 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
664 | mov v25.16b, v31.16b /* q25 := tweak[1] */ | | 664 | mov v25.16b, v31.16b /* q25 := tweak[1] */ |
665 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 665 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
666 | mov v26.16b, v31.16b /* q26 := tweak[2] */ | | 666 | mov v26.16b, v31.16b /* q26 := tweak[2] */ |
667 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 667 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
668 | mov v27.16b, v31.16b /* q27 := tweak[3] */ | | 668 | mov v27.16b, v31.16b /* q27 := tweak[3] */ |
669 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 669 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
670 | mov v28.16b, v31.16b /* q28 := tweak[4] */ | | 670 | mov v28.16b, v31.16b /* q28 := tweak[4] */ |
671 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 671 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
672 | mov v29.16b, v31.16b /* q29 := tweak[5] */ | | 672 | mov v29.16b, v31.16b /* q29 := tweak[5] */ |
673 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 673 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
674 | mov v30.16b, v31.16b /* q30 := tweak[6] */ | | 674 | mov v30.16b, v31.16b /* q30 := tweak[6] */ |
675 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 675 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
676 | /* q31 := tweak[7] */ | | 676 | /* q31 := tweak[7] */ |
677 | ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */ | | 677 | ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */ |
678 | ldp q2, q3, [x1], #0x20 | | 678 | ldp q2, q3, [x1], #0x20 |
679 | ldp q4, q5, [x1], #0x20 | | 679 | ldp q4, q5, [x1], #0x20 |
680 | ldp q6, q7, [x1], #0x20 | | 680 | ldp q6, q7, [x1], #0x20 |
681 | eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */ | | 681 | eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */ |
682 | eor v1.16b, v1.16b, v25.16b | | 682 | eor v1.16b, v1.16b, v25.16b |
683 | eor v2.16b, v2.16b, v26.16b | | 683 | eor v2.16b, v2.16b, v26.16b |
684 | eor v3.16b, v3.16b, v27.16b | | 684 | eor v3.16b, v3.16b, v27.16b |
685 | eor v4.16b, v4.16b, v28.16b | | 685 | eor v4.16b, v4.16b, v28.16b |
686 | eor v5.16b, v5.16b, v29.16b | | 686 | eor v5.16b, v5.16b, v29.16b |
687 | eor v6.16b, v6.16b, v30.16b | | 687 | eor v6.16b, v6.16b, v30.16b |
688 | eor v7.16b, v7.16b, v31.16b | | 688 | eor v7.16b, v7.16b, v31.16b |
689 | mov x0, x9 /* x0 := enckey */ | | 689 | mov x0, x9 /* x0 := enckey */ |
690 | mov x3, x5 /* x3 := nrounds */ | | 690 | mov x3, x5 /* x3 := nrounds */ |
691 | bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */ | | 691 | bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */ |
692 | eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ | | 692 | eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ |
693 | eor v1.16b, v1.16b, v25.16b | | 693 | eor v1.16b, v1.16b, v25.16b |
694 | eor v2.16b, v2.16b, v26.16b | | 694 | eor v2.16b, v2.16b, v26.16b |
695 | eor v3.16b, v3.16b, v27.16b | | 695 | eor v3.16b, v3.16b, v27.16b |
696 | eor v4.16b, v4.16b, v28.16b | | 696 | eor v4.16b, v4.16b, v28.16b |
697 | eor v5.16b, v5.16b, v29.16b | | 697 | eor v5.16b, v5.16b, v29.16b |
698 | eor v6.16b, v6.16b, v30.16b | | 698 | eor v6.16b, v6.16b, v30.16b |
699 | eor v7.16b, v7.16b, v31.16b | | 699 | eor v7.16b, v7.16b, v31.16b |
700 | stp q0, q1, [x2], #0x20 /* store ciphertext blocks */ | | 700 | stp q0, q1, [x2], #0x20 /* store ciphertext blocks */ |
701 | stp q2, q3, [x2], #0x20 | | 701 | stp q2, q3, [x2], #0x20 |
702 | stp q4, q5, [x2], #0x20 | | 702 | stp q4, q5, [x2], #0x20 |
703 | stp q6, q7, [x2], #0x20 | | 703 | stp q6, q7, [x2], #0x20 |
704 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 704 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
705 | subs x10, x10, #0x80 /* count down nbytes */ | | 705 | subs x10, x10, #0x80 /* count down nbytes */ |
706 | b.ne 1b /* repeat if more block groups */ | | 706 | b.ne 1b /* repeat if more block groups */ |
707 | str q31, [x4] /* update tweak */ | | 707 | str q31, [x4] /* update tweak */ |
708 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 708 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
709 | ret | | 709 | ret |
710 | END(aesarmv8_xts_enc8) | | 710 | END(aesarmv8_xts_enc8) |
711 | | | 711 | |
712 | /* | | 712 | /* |
713 | * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, | | 713 | * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1, |
714 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, | | 714 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, |
715 | * uint32_t nrounds@x5) | | 715 | * uint32_t nrounds@x5) |
716 | * | | 716 | * |
717 | * Decrypt a contiguous sequence of blocks with AES-XTS. | | 717 | * Decrypt a contiguous sequence of blocks with AES-XTS. |
718 | * | | 718 | * |
719 | * nbytes must be a positive integral multiple of 16. This routine | | 719 | * nbytes must be a positive integral multiple of 16. This routine |
720 | * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once. | | 720 | * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once. |
721 | * | | 721 | * |
722 | * Standard ABI calling convention. | | 722 | * Standard ABI calling convention. |
723 | */ | | 723 | */ |
724 | ENTRY(aesarmv8_xts_dec1) | | 724 | ENTRY(aesarmv8_xts_dec1) |
725 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 725 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
726 | mov fp, sp | | 726 | mov fp, sp |
727 | mov x9, x0 /* x9 := deckey */ | | 727 | mov x9, x0 /* x9 := deckey */ |
728 | mov x10, x3 /* x10 := nbytes */ | | 728 | mov x10, x3 /* x10 := nbytes */ |
729 | ldr q31, [x4] /* q31 := tweak */ | | 729 | ldr q31, [x4] /* q31 := tweak */ |
730 | 1: ldr q0, [x1], #0x10 /* q0 := ctxt */ | | 730 | 1: ldr q0, [x1], #0x10 /* q0 := ctxt */ |
731 | mov x0, x9 /* x0 := deckey */ | | 731 | mov x0, x9 /* x0 := deckey */ |
732 | mov x3, x5 /* x3 := nrounds */ | | 732 | mov x3, x5 /* x3 := nrounds */ |
733 | eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */ | | 733 | eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */ |
734 | bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */ | | 734 | bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */ |
735 | eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */ | | 735 | eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */ |
736 | str q0, [x2], #0x10 /* store plaintext block */ | | 736 | str q0, [x2], #0x10 /* store plaintext block */ |
737 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 737 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
738 | subs x10, x10, #0x10 /* count down nbytes */ | | 738 | subs x10, x10, #0x10 /* count down nbytes */ |
739 | b.ne 1b /* repeat if more blocks */ | | 739 | b.ne 1b /* repeat if more blocks */ |
740 | str q31, [x4] /* update tweak */ | | 740 | str q31, [x4] /* update tweak */ |
741 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 741 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
742 | ret | | 742 | ret |
743 | END(aesarmv8_xts_dec1) | | 743 | END(aesarmv8_xts_dec1) |
744 | | | 744 | |
745 | /* | | 745 | /* |
746 | * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, | | 746 | * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1, |
747 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, | | 747 | * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, |
748 | * uint32_t nrounds@x5) | | 748 | * uint32_t nrounds@x5) |
749 | * | | 749 | * |
750 | * Decrypt a contiguous sequence of blocks with AES-XTS. | | 750 | * Decrypt a contiguous sequence of blocks with AES-XTS. |
751 | * | | 751 | * |
752 | * nbytes must be a positive integral multiple of 128. | | 752 | * nbytes must be a positive integral multiple of 128. |
753 | * | | 753 | * |
754 | * Standard ABI calling convention. | | 754 | * Standard ABI calling convention. |
755 | */ | | 755 | */ |
756 | ENTRY(aesarmv8_xts_dec8) | | 756 | ENTRY(aesarmv8_xts_dec8) |
757 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 757 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
758 | mov fp, sp | | 758 | mov fp, sp |
759 | mov x9, x0 /* x9 := deckey */ | | 759 | mov x9, x0 /* x9 := deckey */ |
760 | mov x10, x3 /* x10 := nbytes */ | | 760 | mov x10, x3 /* x10 := nbytes */ |
761 | ldr q31, [x4] /* q31 := tweak */ | | 761 | ldr q31, [x4] /* q31 := tweak */ |
762 | 1: mov v24.16b, v31.16b /* q24 := tweak[0] */ | | 762 | 1: mov v24.16b, v31.16b /* q24 := tweak[0] */ |
763 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 763 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
764 | mov v25.16b, v31.16b /* q25 := tweak[1] */ | | 764 | mov v25.16b, v31.16b /* q25 := tweak[1] */ |
765 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 765 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
766 | mov v26.16b, v31.16b /* q26 := tweak[2] */ | | 766 | mov v26.16b, v31.16b /* q26 := tweak[2] */ |
767 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 767 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
768 | mov v27.16b, v31.16b /* q27 := tweak[3] */ | | 768 | mov v27.16b, v31.16b /* q27 := tweak[3] */ |
769 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 769 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
770 | mov v28.16b, v31.16b /* q28 := tweak[4] */ | | 770 | mov v28.16b, v31.16b /* q28 := tweak[4] */ |
771 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 771 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
772 | mov v29.16b, v31.16b /* q29 := tweak[5] */ | | 772 | mov v29.16b, v31.16b /* q29 := tweak[5] */ |
773 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 773 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
774 | mov v30.16b, v31.16b /* q30 := tweak[6] */ | | 774 | mov v30.16b, v31.16b /* q30 := tweak[6] */ |
775 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 775 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
776 | /* q31 := tweak[7] */ | | 776 | /* q31 := tweak[7] */ |
777 | ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */ | | 777 | ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */ |
778 | ldp q2, q3, [x1], #0x20 | | 778 | ldp q2, q3, [x1], #0x20 |
779 | ldp q4, q5, [x1], #0x20 | | 779 | ldp q4, q5, [x1], #0x20 |
780 | ldp q6, q7, [x1], #0x20 | | 780 | ldp q6, q7, [x1], #0x20 |
781 | eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */ | | 781 | eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */ |
782 | eor v1.16b, v1.16b, v25.16b | | 782 | eor v1.16b, v1.16b, v25.16b |
783 | eor v2.16b, v2.16b, v26.16b | | 783 | eor v2.16b, v2.16b, v26.16b |
784 | eor v3.16b, v3.16b, v27.16b | | 784 | eor v3.16b, v3.16b, v27.16b |
785 | eor v4.16b, v4.16b, v28.16b | | 785 | eor v4.16b, v4.16b, v28.16b |
786 | eor v5.16b, v5.16b, v29.16b | | 786 | eor v5.16b, v5.16b, v29.16b |
787 | eor v6.16b, v6.16b, v30.16b | | 787 | eor v6.16b, v6.16b, v30.16b |
788 | eor v7.16b, v7.16b, v31.16b | | 788 | eor v7.16b, v7.16b, v31.16b |
789 | mov x0, x9 /* x0 := deckey */ | | 789 | mov x0, x9 /* x0 := deckey */ |
790 | mov x3, x5 /* x3 := nrounds */ | | 790 | mov x3, x5 /* x3 := nrounds */ |
791 | bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */ | | 791 | bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */ |
792 | eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ | | 792 | eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */ |
793 | eor v1.16b, v1.16b, v25.16b | | 793 | eor v1.16b, v1.16b, v25.16b |
794 | eor v2.16b, v2.16b, v26.16b | | 794 | eor v2.16b, v2.16b, v26.16b |
795 | eor v3.16b, v3.16b, v27.16b | | 795 | eor v3.16b, v3.16b, v27.16b |
796 | eor v4.16b, v4.16b, v28.16b | | 796 | eor v4.16b, v4.16b, v28.16b |
797 | eor v5.16b, v5.16b, v29.16b | | 797 | eor v5.16b, v5.16b, v29.16b |
798 | eor v6.16b, v6.16b, v30.16b | | 798 | eor v6.16b, v6.16b, v30.16b |
799 | eor v7.16b, v7.16b, v31.16b | | 799 | eor v7.16b, v7.16b, v31.16b |
800 | stp q0, q1, [x2], #0x20 /* store plaintext blocks */ | | 800 | stp q0, q1, [x2], #0x20 /* store plaintext blocks */ |
801 | stp q2, q3, [x2], #0x20 | | 801 | stp q2, q3, [x2], #0x20 |
802 | stp q4, q5, [x2], #0x20 | | 802 | stp q4, q5, [x2], #0x20 |
803 | stp q6, q7, [x2], #0x20 | | 803 | stp q6, q7, [x2], #0x20 |
804 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 804 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
805 | subs x10, x10, #0x80 /* count down nbytes */ | | 805 | subs x10, x10, #0x80 /* count down nbytes */ |
806 | b.ne 1b /* repeat if more block groups */ | | 806 | b.ne 1b /* repeat if more block groups */ |
807 | str q31, [x4] /* update tweak */ | | 807 | str q31, [x4] /* update tweak */ |
808 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 808 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
809 | ret | | 809 | ret |
810 | END(aesarmv8_xts_dec8) | | 810 | END(aesarmv8_xts_dec8) |
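| | | | |
| /* | | | /* |
|  * Structurally this differs from the one-block path only in the | | |  * Structurally this differs from the one-block path only in the |
|  * tweak schedule: each 128-byte group consumes eight successive | | |  * tweak schedule: each 128-byte group consumes eight successive |
|  * powers of x.  A hedged C sketch of that schedule, with | | |  * powers of x.  A hedged C sketch of that schedule, with |
|  * xts_mulx() a hypothetical scalar stand-in for | | |  * xts_mulx() a hypothetical scalar stand-in for |
|  * aesarmv8_xts_mulx: | | |  * aesarmv8_xts_mulx: |
|  * | | |  * |
|  *     #include <stdint.h> | | |  *     #include <stdint.h> |
|  *     #include <string.h> | | |  *     #include <string.h> |
|  * | | |  * |
|  *     void xts_mulx(uint8_t[16]); | | |  *     void xts_mulx(uint8_t[16]); |
|  * | | |  * |
|  *     static void | | |  *     static void |
|  *     xts_tweak_schedule(uint8_t tweak[16], uint8_t twk[8][16]) | | |  *     xts_tweak_schedule(uint8_t tweak[16], uint8_t twk[8][16]) |
|  *     { | | |  *     { |
|  *             unsigned i; | | |  *             unsigned i; |
|  * | | |  * |
|  *             for (i = 0; i < 8; i++) { | | |  *             for (i = 0; i < 8; i++) { |
|  *                     memcpy(twk[i], tweak, 16); // twk[i] = T * x^i | | |  *                     memcpy(twk[i], tweak, 16); // twk[i] = T * x^i |
|  *                     xts_mulx(tweak);  // ends at T * x^8 for the | | |  *                     xts_mulx(tweak);  // ends at T * x^8 for the |
|  *             }                         // next group | | |  *             }                         // next group |
|  *     } | | |  *     } |
|  */ | | |  */ |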
811 | | | 811 | |
812 | /* | | 812 | /* |
813 | * aesarmv8_xts_mulx(tweak@q31) | | 813 | * aesarmv8_xts_mulx(tweak@q31) |
814 | * | | 814 | * |
815 | * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place. | | 815 | * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place. |
816 | * Uses x0 and q0/q1 as temporaries. | | 816 | * Uses x0 and q0/q1 as temporaries. |
817 | */ | | 817 | */ |
818 | .text | | 818 | .text |
819 | _ALIGN_TEXT | | 819 | _ALIGN_TEXT |
820 | .type aesarmv8_xts_mulx,@function | | 820 | .type aesarmv8_xts_mulx,@function |
821 | aesarmv8_xts_mulx: | | 821 | aesarmv8_xts_mulx: |
822 | /* | | 822 | /* |
823 | * Simultaneously determine | | 823 | * Simultaneously determine |
824 | * (a) whether the high bit of the low half must be | | 824 | * (a) whether the high bit of the low half must be |
825 | * shifted into the low bit of the high half, and | | 825 | * shifted into the low bit of the high half, and |
826 | * (b) whether the high bit of the high half must be | | 826 | * (b) whether the high bit of the high half must be |
827 | * carried into x^128 = x^7 + x^2 + x + 1. | | 827 | * carried into x^128 = x^7 + x^2 + x + 1. |
828 | */ | | 828 | */ |
829 | adrl x0, xtscarry | | 829 | adrl x0, xtscarry |
830 | cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */ | | 830 | cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */ |
831 | ldr q0, [x0] /* q0 := xtscarry */ | | 831 | ldr q0, [x0] /* q0 := xtscarry */ |
832 | ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */ | | 832 | ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */ |
833 | shl v31.2d, v31.2d, #1 /* shift */ | | 833 | shl v31.2d, v31.2d, #1 /* shift */ |
834 | and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */ | | 834 | and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */ |
835 | eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */ | | 835 | eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */ |
836 | ret | | 836 | ret |
837 | END(aesarmv8_xts_mulx) | | 837 | END(aesarmv8_xts_mulx) |
838 | | | 838 | |
839 | .section .rodata | | 839 | .section .rodata |
840 | .p2align 4 | | 840 | .p2align 4 |
841 | .type xtscarry,@object | | 841 | .type xtscarry,@object |
842 | xtscarry: | | 842 | xtscarry: |
843 | .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0 | | 843 | .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0 |
844 | END(xtscarry) | | 844 | END(xtscarry) |
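| | | | |
| /* | | | /* |
|  * The carry trick above, restated as a hedged C sketch (an | | |  * The carry trick above, restated as a hedged C sketch (an |
|  * assumption for illustration, not part of this file): each | | |  * assumption for illustration, not part of this file): each |
|  * 64-bit lane shifts left by one; the bit shifted out of the | | |  * 64-bit lane shifts left by one; the bit shifted out of the |
|  * low lane enters the high lane, and the bit shifted out of | | |  * low lane enters the high lane, and the bit shifted out of |
|  * the high lane folds back in as x^7 + x^2 + x + 1 = 0x87. | | |  * the high lane folds back in as x^7 + x^2 + x + 1 = 0x87. |
|  * | | |  * |
|  *     #include <stdint.h> | | |  *     #include <stdint.h> |
|  * | | |  * |
|  *     static void | | |  *     static void |
|  *     xts_mulx_sketch(uint64_t t[2]) // t[0] low, t[1] high | | |  *     xts_mulx_sketch(uint64_t t[2]) // t[0] low, t[1] high |
|  *     { | | |  *     { |
|  *             uint64_t m_lo = (uint64_t)((int64_t)t[0] >> 63); | | |  *             uint64_t m_lo = (uint64_t)((int64_t)t[0] >> 63); |
|  *             uint64_t m_hi = (uint64_t)((int64_t)t[1] >> 63); | | |  *             uint64_t m_hi = (uint64_t)((int64_t)t[1] >> 63); |
|  * | | |  * |
|  *             t[0] = (t[0] << 1) ^ (m_hi & 0x87); // x^128 folds back | | |  *             t[0] = (t[0] << 1) ^ (m_hi & 0x87); // x^128 folds back |
|  *             t[1] = (t[1] << 1) ^ (m_lo & 1);    // bit 63 -> bit 64 | | |  *             t[1] = (t[1] << 1) ^ (m_lo & 1);    // bit 63 -> bit 64 |
|  *     } | | |  *     } |
|  * | | |  * |
|  * Swapping the two mask lanes corresponds to the ext instruction, | | |  * Swapping the two mask lanes corresponds to the ext instruction, |
|  * and the masked constants 0x87 and 1 are exactly the two halves | | |  * and the masked constants 0x87 and 1 are exactly the two halves |
|  * of xtscarry. | | |  * of xtscarry. |
|  */ | | |  */ |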
845 | | | 845 | |
846 | /* | | 846 | /* |
847 | * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1) | | 847 | * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1) |
848 | * | | 848 | * |
849 | * Update an AES-XTS tweak. | | 849 | * Update an AES-XTS tweak. |
850 | * | | 850 | * |
851 | * Standard ABI calling convention. | | 851 | * Standard ABI calling convention. |
852 | */ | | 852 | */ |
853 | ENTRY(aesarmv8_xts_update) | | 853 | ENTRY(aesarmv8_xts_update) |
854 | stp fp, lr, [sp, #-16]! /* push stack frame */ | | 854 | stp fp, lr, [sp, #-16]! /* push stack frame */ |
855 | mov fp, sp | | 855 | mov fp, sp |
856 | ldr q31, [x0] /* load tweak */ | | 856 | ldr q31, [x0] /* load tweak */ |
857 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ | | 857 | bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */ |
858 | str q31, [x1] /* store tweak */ | | 858 | str q31, [x1] /* store tweak */ |
859 | ldp fp, lr, [sp], #16 /* pop stack frame */ | | 859 | ldp fp, lr, [sp], #16 /* pop stack frame */ |
860 | ret | | 860 | ret |
861 | END(aesarmv8_xts_update) | | 861 | END(aesarmv8_xts_update) |
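| | | | |
| /* | | | /* |
|  * Hypothetical usage sketch, an assumption rather than code from | | |  * Hypothetical usage sketch, an assumption rather than code from |
|  * this file: stepping a saved tweak forward by one block.  The | | |  * this file: stepping a saved tweak forward by one block.  The |
|  * tweak is loaded in full before it is stored, so in and out | | |  * tweak is loaded in full before it is stored, so in and out |
|  * may alias. | | |  * may alias. |
|  * | | |  * |
|  *     void aesarmv8_xts_update(const uint8_t in[16], | | |  *     void aesarmv8_xts_update(const uint8_t in[16], |
|  *         uint8_t out[16]); | | |  *         uint8_t out[16]); |
|  * | | |  * |
|  *     uint8_t tweak[16]; | | |  *     uint8_t tweak[16]; |
|  *     // ... tweak initialized by the caller ... | | |  *     // ... tweak initialized by the caller ... |
|  *     aesarmv8_xts_update(tweak, tweak); // tweak *= x, in place | | |  *     aesarmv8_xts_update(tweak, tweak); // tweak *= x, in place |
|  */ | | |  */ |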
862 | | | 862 | |
863 | /* | | 863 | /* |
864 | * aesarmv8_enc1(const struct aesenc *enckey@x0, | | 864 | * aesarmv8_enc1(const struct aesenc *enckey@x0, |
865 | * uint128_t block@q0, uint32_t nrounds@x3) | | 865 | * uint128_t block@q0, uint32_t nrounds@x3) |
866 | * | | 866 | * |
867 | * Encrypt a single AES block in q0. | | 867 | * Encrypt a single AES block in q0. |
868 | * | | 868 | * |
869 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. | | 869 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. |
870 | */ | | 870 | */ |
871 | .text | | 871 | .text |
872 | _ALIGN_TEXT | | 872 | _ALIGN_TEXT |
873 | .type aesarmv8_enc1,@function | | 873 | .type aesarmv8_enc1,@function |
874 | aesarmv8_enc1: | | 874 | aesarmv8_enc1: |
875 | ldr q16, [x0], #0x10 /* load round key */ | | 875 | ldr q16, [x0], #0x10 /* load round key */ |
876 | 1: subs x3, x3, #1 | | 876 | 1: subs x3, x3, #1 |
877 | /* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */ | | 877 | /* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */ |
878 | aese v0.16b, v16.16b | | 878 | aese v0.16b, v16.16b |
879 | ldr q16, [x0], #0x10 /* load next round key */ | | 879 | ldr q16, [x0], #0x10 /* load next round key */ |
880 | b.eq 2f | | 880 | b.eq 2f |
881 | /* q0 := MixColumns(q0) */ | | 881 | /* q0 := MixColumns(q0) */ |
882 | aesmc v0.16b, v0.16b | | 882 | aesmc v0.16b, v0.16b |
883 | b 1b | | 883 | b 1b |
884 | 2: eor v0.16b, v0.16b, v16.16b | | 884 | 2: eor v0.16b, v0.16b, v16.16b |
885 | ret | | 885 | ret |
886 | END(aesarmv8_enc1) | | 886 | END(aesarmv8_enc1) |
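| | | | |
| /* | | | /* |
|  * The same round structure, restated as a hedged ACLE intrinsics | | |  * The same round structure, restated as a hedged ACLE intrinsics |
|  * sketch: an assumption for illustration, not code from this | | |  * sketch: an assumption for illustration, not code from this |
|  * file (compile with -march=armv8-a+crypto).  Note that nrounds | | |  * file (compile with -march=armv8-a+crypto).  Note that nrounds |
|  * aese steps are issued but only nrounds - 1 aesmc steps, and | | |  * aese steps are issued but only nrounds - 1 aesmc steps, and |
|  * nrounds + 1 round keys are consumed. | | |  * nrounds + 1 round keys are consumed. |
|  * | | |  * |
|  *     #include <arm_neon.h> | | |  *     #include <arm_neon.h> |
|  * | | |  * |
|  *     static uint8x16_t | | |  *     static uint8x16_t |
|  *     aes_enc1_sketch(const uint8x16_t *rk, uint8x16_t blk, | | |  *     aes_enc1_sketch(const uint8x16_t *rk, uint8x16_t blk, |
|  *         unsigned nrounds) | | |  *         unsigned nrounds) |
|  *     { | | |  *     { |
|  *             unsigned i; | | |  *             unsigned i; |
|  * | | |  * |
|  *             for (i = 0; i < nrounds - 1; i++) { | | |  *             for (i = 0; i < nrounds - 1; i++) { |
|  *                     blk = vaeseq_u8(blk, rk[i]); // AddRoundKey, | | |  *                     blk = vaeseq_u8(blk, rk[i]); // AddRoundKey, |
|  *                     blk = vaesmcq_u8(blk);  // SubBytes, ShiftRows, | | |  *                     blk = vaesmcq_u8(blk);  // SubBytes, ShiftRows, |
|  *             }                               // then MixColumns | | |  *             }                               // then MixColumns |
|  *             blk = vaeseq_u8(blk, rk[nrounds - 1]); // no MixColumns | | |  *             blk = vaeseq_u8(blk, rk[nrounds - 1]); // no MixColumns |
|  *             return veorq_u8(blk, rk[nrounds]); // final AddRoundKey | | |  *             return veorq_u8(blk, rk[nrounds]); // final AddRoundKey |
|  *     } | | |  *     } |
|  */ | | |  */ |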
887 | | | 887 | |
888 | /* | | 888 | /* |
889 | * aesarmv8_enc8(const struct aesenc *enckey@x0, | | 889 | * aesarmv8_enc8(const struct aesenc *enckey@x0, |
890 | * uint128_t block0@q0, ..., uint128_t block7@q7, | | 890 | * uint128_t block0@q0, ..., uint128_t block7@q7, |
891 | * uint32_t nrounds@x3) | | 891 | * uint32_t nrounds@x3) |
892 | * | | 892 | * |
893 | * Encrypt eight AES blocks in q0 through q7 in parallel. | | 893 | * Encrypt eight AES blocks in q0 through q7 in parallel. |
894 | * | | 894 | * |
895 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. | | 895 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. |
896 | */ | | 896 | */ |
897 | .text | | 897 | .text |
898 | _ALIGN_TEXT | | 898 | _ALIGN_TEXT |
899 | .type aesarmv8_enc8,@function | | 899 | .type aesarmv8_enc8,@function |
900 | aesarmv8_enc8: | | 900 | aesarmv8_enc8: |
901 | ldr q16, [x0], #0x10 /* load round key */ | | 901 | ldr q16, [x0], #0x10 /* load round key */ |
902 | 1: subs x3, x3, #1 | | 902 | 1: subs x3, x3, #1 |
903 | /* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */ | | 903 | /* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */ |
904 | aese v0.16b, v16.16b | | 904 | aese v0.16b, v16.16b |
905 | aese v1.16b, v16.16b | | 905 | aese v1.16b, v16.16b |
906 | aese v2.16b, v16.16b | | 906 | aese v2.16b, v16.16b |
907 | aese v3.16b, v16.16b | | 907 | aese v3.16b, v16.16b |
908 | aese v4.16b, v16.16b | | 908 | aese v4.16b, v16.16b |
909 | aese v5.16b, v16.16b | | 909 | aese v5.16b, v16.16b |
910 | aese v6.16b, v16.16b | | 910 | aese v6.16b, v16.16b |
911 | aese v7.16b, v16.16b | | 911 | aese v7.16b, v16.16b |
912 | ldr q16, [x0], #0x10 /* load next round key */ | | 912 | ldr q16, [x0], #0x10 /* load next round key */ |
913 | b.eq 2f | | 913 | b.eq 2f |
914 | /* q[i] := MixColumns(q[i]) */ | | 914 | /* q[i] := MixColumns(q[i]) */ |
915 | aesmc v0.16b, v0.16b | | 915 | aesmc v0.16b, v0.16b |
916 | aesmc v1.16b, v1.16b | | 916 | aesmc v1.16b, v1.16b |
917 | aesmc v2.16b, v2.16b | | 917 | aesmc v2.16b, v2.16b |
918 | aesmc v3.16b, v3.16b | | 918 | aesmc v3.16b, v3.16b |
919 | aesmc v4.16b, v4.16b | | 919 | aesmc v4.16b, v4.16b |
920 | aesmc v5.16b, v5.16b | | 920 | aesmc v5.16b, v5.16b |
921 | aesmc v6.16b, v6.16b | | 921 | aesmc v6.16b, v6.16b |
922 | aesmc v7.16b, v7.16b | | 922 | aesmc v7.16b, v7.16b |
923 | b 1b | | 923 | b 1b |
924 | 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */ | | 924 | 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */ |
925 | eor v1.16b, v1.16b, v16.16b | | 925 | eor v1.16b, v1.16b, v16.16b |
926 | eor v2.16b, v2.16b, v16.16b | | 926 | eor v2.16b, v2.16b, v16.16b |
927 | eor v3.16b, v3.16b, v16.16b | | 927 | eor v3.16b, v3.16b, v16.16b |
928 | eor v4.16b, v4.16b, v16.16b | | 928 | eor v4.16b, v4.16b, v16.16b |
929 | eor v5.16b, v5.16b, v16.16b | | 929 | eor v5.16b, v5.16b, v16.16b |
930 | eor v6.16b, v6.16b, v16.16b | | 930 | eor v6.16b, v6.16b, v16.16b |
931 | eor v7.16b, v7.16b, v16.16b | | 931 | eor v7.16b, v7.16b, v16.16b |
932 | ret | | 932 | ret |
933 | END(aesarmv8_enc8) | | 933 | END(aesarmv8_enc8) |
934 | | | 934 | |
935 | /* | | 935 | /* |
936 | * aesarmv8_dec1(const struct aesdec *deckey@x0, | | 936 | * aesarmv8_dec1(const struct aesdec *deckey@x0, |
937 | * uint128_t block@q0, uint32_t nrounds@x3) | | 937 | * uint128_t block@q0, uint32_t nrounds@x3) |
938 | * | | 938 | * |
939 | * Decrypt a single AES block in q0. | | 939 | * Decrypt a single AES block in q0. |
940 | * | | 940 | * |
941 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. | | 941 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. |
942 | */ | | 942 | */ |
943 | .text | | 943 | .text |
944 | _ALIGN_TEXT | | 944 | _ALIGN_TEXT |
945 | .type aesarmv8_dec1,@function | | 945 | .type aesarmv8_dec1,@function |
946 | aesarmv8_dec1: | | 946 | aesarmv8_dec1: |
947 | ldr q16, [x0], #0x10 /* load round key */ | | 947 | ldr q16, [x0], #0x10 /* load round key */ |
948 | 1: subs x3, x3, #1 | | 948 | 1: subs x3, x3, #1 |
949 | /* q0 := InvSubBytes(InvShiftRows(AddRoundKey_q16(q0))) */ | | 949 | /* q0 := InvSubBytes(InvShiftRows(AddRoundKey_q16(q0))) */ |
950 | aesd v0.16b, v16.16b | | 950 | aesd v0.16b, v16.16b |
951 | ldr q16, [x0], #0x10 /* load next round key */ | | 951 | ldr q16, [x0], #0x10 /* load next round key */ |
952 | b.eq 2f | | 952 | b.eq 2f |
953 | /* q0 := InvMixColumns(q0) */ | | 953 | /* q0 := InvMixColumns(q0) */ |
954 | aesimc v0.16b, v0.16b | | 954 | aesimc v0.16b, v0.16b |
955 | b 1b | | 955 | b 1b |
956 | 2: eor v0.16b, v0.16b, v16.16b | | 956 | 2: eor v0.16b, v0.16b, v16.16b |
957 | ret | | 957 | ret |
958 | END(aesarmv8_dec1) | | 958 | END(aesarmv8_dec1) |
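| | | | |
| /* | | | /* |
|  * A matching hedged intrinsics sketch, again an assumption for | | |  * A matching hedged intrinsics sketch, again an assumption for |
|  * illustration.  Because aesd/aesimc implement the equivalent | | |  * illustration.  Because aesd/aesimc implement the equivalent |
|  * inverse cipher, rk[] is assumed to hold decryption round keys | | |  * inverse cipher, rk[] is assumed to hold decryption round keys |
|  * with InvMixColumns already applied to the middle rounds. | | |  * with InvMixColumns already applied to the middle rounds. |
|  * | | |  * |
|  *     #include <arm_neon.h> | | |  *     #include <arm_neon.h> |
|  * | | |  * |
|  *     static uint8x16_t | | |  *     static uint8x16_t |
|  *     aes_dec1_sketch(const uint8x16_t *rk, uint8x16_t blk, | | |  *     aes_dec1_sketch(const uint8x16_t *rk, uint8x16_t blk, |
|  *         unsigned nrounds) | | |  *         unsigned nrounds) |
|  *     { | | |  *     { |
|  *             unsigned i; | | |  *             unsigned i; |
|  * | | |  * |
|  *             for (i = 0; i < nrounds - 1; i++) { | | |  *             for (i = 0; i < nrounds - 1; i++) { |
|  *                     blk = vaesdq_u8(blk, rk[i]); // AddRoundKey, | | |  *                     blk = vaesdq_u8(blk, rk[i]); // AddRoundKey, |
|  *                     blk = vaesimcq_u8(blk);  // InvShiftRows, | | |  *                     blk = vaesimcq_u8(blk);  // InvShiftRows, |
|  *             }              // InvSubBytes, then InvMixColumns | | |  *             }              // InvSubBytes, then InvMixColumns |
|  *             blk = vaesdq_u8(blk, rk[nrounds - 1]); | | |  *             blk = vaesdq_u8(blk, rk[nrounds - 1]); |
|  *             return veorq_u8(blk, rk[nrounds]); // final AddRoundKey | | |  *             return veorq_u8(blk, rk[nrounds]); // final AddRoundKey |
|  *     } | | |  *     } |
|  */ | | |  */ |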
959 | | | 959 | |
960 | /* | | 960 | /* |
961 | * aesarmv8_dec8(const struct aesdec *deckey@x0, | | 961 | * aesarmv8_dec8(const struct aesdec *deckey@x0, |
962 | * uint128_t block0@q0, ..., uint128_t block7@q7, | | 962 | * uint128_t block0@q0, ..., uint128_t block7@q7, |
963 | * uint32_t nrounds@x3) | | 963 | * uint32_t nrounds@x3) |
964 | * | | 964 | * |
965 | * Decrypt eight AES blocks in q0 through q7 in parallel. | | 965 | * Decrypt eight AES blocks in q0 through q7 in parallel. |
966 | * | | 966 | * |
967 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. | | 967 | * Internal ABI. Uses q16 as temporary. Destroys x0 and x3. |
968 | */ | | 968 | */ |
969 | .text | | 969 | .text |
970 | _ALIGN_TEXT | | 970 | _ALIGN_TEXT |
971 | .type aesarmv8_dec8,@function | | 971 | .type aesarmv8_dec8,@function |
972 | aesarmv8_dec8: | | 972 | aesarmv8_dec8: |
973 | ldr q16, [x0], #0x10 /* load round key */ | | 973 | ldr q16, [x0], #0x10 /* load round key */ |
974 | 1: subs x3, x3, #1 | | 974 | 1: subs x3, x3, #1 |
975 | /* q[i] := InvSubBytes(InvShiftRows(AddRoundKey_q16(q[i]))) */ | | 975 | /* q[i] := InvSubBytes(InvShiftRows(AddRoundKey_q16(q[i]))) */ |
976 | aesd v0.16b, v16.16b | | 976 | aesd v0.16b, v16.16b |
977 | aesd v1.16b, v16.16b | | 977 | aesd v1.16b, v16.16b |
978 | aesd v2.16b, v16.16b | | 978 | aesd v2.16b, v16.16b |
979 | aesd v3.16b, v16.16b | | 979 | aesd v3.16b, v16.16b |
980 | aesd v4.16b, v16.16b | | 980 | aesd v4.16b, v16.16b |
981 | aesd v5.16b, v16.16b | | 981 | aesd v5.16b, v16.16b |
982 | aesd v6.16b, v16.16b | | 982 | aesd v6.16b, v16.16b |
983 | aesd v7.16b, v16.16b | | 983 | aesd v7.16b, v16.16b |
984 | ldr q16, [x0], #0x10 /* load next round key */ | | 984 | ldr q16, [x0], #0x10 /* load next round key */ |
985 | b.eq 2f | | 985 | b.eq 2f |
986 | /* q[i] := InvMixColumns(q[i]) */ | | 986 | /* q[i] := InvMixColumns(q[i]) */ |
987 | aesimc v0.16b, v0.16b | | 987 | aesimc v0.16b, v0.16b |
988 | aesimc v1.16b, v1.16b | | 988 | aesimc v1.16b, v1.16b |
989 | aesimc v2.16b, v2.16b | | 989 | aesimc v2.16b, v2.16b |
990 | aesimc v3.16b, v3.16b | | 990 | aesimc v3.16b, v3.16b |
991 | aesimc v4.16b, v4.16b | | 991 | aesimc v4.16b, v4.16b |
992 | aesimc v5.16b, v5.16b | | 992 | aesimc v5.16b, v5.16b |
993 | aesimc v6.16b, v6.16b | | 993 | aesimc v6.16b, v6.16b |
994 | aesimc v7.16b, v7.16b | | 994 | aesimc v7.16b, v7.16b |
995 | b 1b | | 995 | b 1b |
996 | 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */ | | 996 | 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */ |
997 | eor v1.16b, v1.16b, v16.16b | | 997 | eor v1.16b, v1.16b, v16.16b |
998 | eor v2.16b, v2.16b, v16.16b | | 998 | eor v2.16b, v2.16b, v16.16b |
999 | eor v3.16b, v3.16b, v16.16b | | 999 | eor v3.16b, v3.16b, v16.16b |
1000 | eor v4.16b, v4.16b, v16.16b | | 1000 | eor v4.16b, v4.16b, v16.16b |
1001 | eor v5.16b, v5.16b, v16.16b | | 1001 | eor v5.16b, v5.16b, v16.16b |
1002 | eor v6.16b, v6.16b, v16.16b | | 1002 | eor v6.16b, v6.16b, v16.16b |
1003 | eor v7.16b, v7.16b, v16.16b | | 1003 | eor v7.16b, v7.16b, v16.16b |
1004 | ret | | 1004 | ret |
1005 | END(aesarmv8_dec8) | | 1005 | END(aesarmv8_dec8) |