Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost, around 20% in cgd tests, on rk3399
(Cortex-A53/A72) for parallelizable operations like CBC decryption;
the same improvement should probably carry over to the rpi4 CPU,
which lacks ARMv8.0-AES.
(riastradh)
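The new 2x entry points below take a uint8x16x2_t holding two
independent blocks, so the tbl/eor dependency chains of both blocks
can interleave and keep the NEON pipeline busy.  As a minimal
caller-side sketch of the added aes_neon_dec2 interface (the signature
is from the diff below; the buffer setup around it is hypothetical):

	/*
	 * Hypothetical usage sketch, not part of the change: decrypt
	 * two independent 16-byte blocks in one call.  `dec', `in',
	 * `out', and `nrounds' are assumed set up by the caller.
	 */
	uint8x16x2_t b2;

	b2.val[0] = vld1q_u8(in);	/* first block */
	b2.val[1] = vld1q_u8(in + 16);	/* second block */
	b2 = aes_neon_dec2(dec, b2, nrounds);
	vst1q_u8(out, b2.val[0]);
	vst1q_u8(out + 16, b2.val[1]);

CBC decryption can feed this two blocks at a time because each
plaintext block depends only on ciphertext already in hand; CBC
encryption cannot, since each block's input depends on the previous
block's output.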
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c

--- src/sys/crypto/aes/arch/arm/aes_neon.c	2020/06/30 20:32:11	1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c	2020/07/28 20:11:09	1.4
@@ -1,631 +1,740 @@
-/*	$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $	*/
+/*	$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
  * software, at <https://crypto.stanford.edu/vpaes/>, described in
  *
  *	Mike Hamburg, `Accelerating AES with Vector Permute
  *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
  *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
  *	Springer LNCS 5747, pp. 18-32.
  *
  *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
 
 #include <sys/types.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #else
 #include <err.h>
 #define	panic(fmt, args...)	err(1, fmt, ##args)
 #endif
 
 #include "aes_neon_impl.h"
 
 #ifdef __aarch64__
 #define	__aarch64_used
 #else
 #define	__aarch64_used	__unused
 #endif
 
 static const uint8x16_t
 mc_forward[4] = {
 	{0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
 	 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
 	{0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
 	 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
 	{0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
 	 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
 	{0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
 	 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
 },
 mc_backward[4] __aarch64_used = {
 	{0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
 	 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
 	{0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
 	 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
 	{0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
 	 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
 	{0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
 	 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
 },
 ipt[2] __aarch64_used = {
 	{0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
 	 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
 	{0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
 	 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
 },
 opt[2] = {
 	{0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
 	 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
 	{0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
 	 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
 },
 dipt[2] __aarch64_used = {
 	{0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
 	 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
 	{0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
 	 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
 },
 sb1[2] __aarch64_used = {
 	{0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
 	 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
 	{0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
 	 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
 },
 sb2[2] __aarch64_used = {
 	{0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
 	 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
 	{0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
 	 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
 },
 sbo[2] __aarch64_used = {
 	{0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
 	 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
 	{0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
 	 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
 },
 dsb9[2] __aarch64_used = {
 	{0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
 	 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
 	{0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
 	 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
 },
 dsbd[2] __aarch64_used = {
 	{0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
 	 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
 	{0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
 	 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
 },
 dsbb[2] __aarch64_used = {
 	{0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
 	 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
 	{0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
 	 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
 },
 dsbe[2] __aarch64_used = {
 	{0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
 	 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
 	{0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
 	 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
 },
 dsbo[2] __aarch64_used = {
 	{0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
 	 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
 	{0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
 	 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
 },
 dks1[2] = {
 	{0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
 	 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
 	{0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
 	 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
 },
 dks2[2] = {
 	{0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
 	 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
 	{0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
 	 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
 },
 dks3[2] = {
 	{0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
 	 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
 	{0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
 	 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
 },
 dks4[2] = {
 	{0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
 	 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
 	{0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
 	 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
 },
 deskew[2] = {
 	{0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
 	 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
 	{0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
 	 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
 },
 sr[4] __aarch64_used = {
 	{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 	 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
 	{0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
 	 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
 	{0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
 	 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
 	{0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
 	 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
 },
 rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
 	0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
 s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
 	0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
 of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
 	0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
 inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
 	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
 inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
 	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};
 
 static inline uint8x16_t
 loadroundkey(const void *rkp)
 {
 	return vld1q_u8(rkp);
 }
 
 static inline void
 storeroundkey(void *rkp, uint8x16_t rk)
 {
 	vst1q_u8(rkp, rk);
 }
 
 /* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
 static inline void
 bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
 {
 
 	*lo = of & x;
 	*hi = of & vshrq_n_u8(x, 4);
 }
 
 /*
  * t is a pair of maps respectively from low and high nybbles to bytes.
  * Apply t the nybbles, and add the results in GF(2).
  */
 static uint8x16_t
 aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
 {
 	uint8x16_t lo, hi;
 
 	bytes2nybbles(&lo, &hi, x);
 	return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
 }
 
 static inline void
 subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
     uint8x16_t inva_)
 {
 	uint8x16_t k, i, ak, j;
 
 	bytes2nybbles(&k, &i, x);
 	ak = vqtbl1q_u8(inva_, k);
 	j = i ^ k;
 	*io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
 	*jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
 }
 
 static uint8x16_t
 aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
 {
 	uint8x16_t io, jo;
 
 	/* smear prk */
 	prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
 	prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
 	prk ^= s63;
 
 	/* subbytes */
 	subbytes(&io, &jo, rk, inv, inva);
 	rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);
 
 	/* add in smeared stuff */
 	return rk ^ prk;
 }
 
 static uint8x16_t
 aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
 {
 	uint32x4_t rk32;
 
 	/* extract rcon from rcon_rot */
 	prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
 	*rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);
 
 	/* rotate */
 	rk32 = vreinterpretq_u32_u8(rk);
 	rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
 	rk = vreinterpretq_u8_u32(rk32);
 	rk = vextq_u8(rk, rk, 1);
 
 	return aes_schedule_low_round(rk, prk);
 }
 
 static uint8x16_t
 aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
 {
 	uint8x16_t y = vdupq_n_u8(0);
 
 	x ^= s63;
 
 	x = vqtbl1q_u8(x, mc_forward[0]);
 	y ^= x;
 	x = vqtbl1q_u8(x, mc_forward[0]);
 	y ^= x;
 	x = vqtbl1q_u8(x, mc_forward[0]);
 	y ^= x;
 
 	return vqtbl1q_u8(y, sr_i);
 }
 
 static uint8x16_t
 aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
 {
 
 	return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
 }
 
 static uint8x16_t
 aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
 {
 	uint8x16_t y = vdupq_n_u8(0);
 
 	x = aes_schedule_transform(x, dks1);
 	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
 	x = aes_schedule_transform(x, dks2);
 	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
 	x = aes_schedule_transform(x, dks3);
 	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
 	x = aes_schedule_transform(x, dks4);
 	y = vqtbl1q_u8(y ^ x, mc_forward[0]);
 
 	return vqtbl1q_u8(y, sr_i);
 }
 
 static uint8x16_t
 aes_schedule_mangle_last_dec(uint8x16_t x)
 {
 
 	return aes_schedule_transform(x ^ s63, deskew);
 }
 
 static uint8x16_t
 aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
 {
 	uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
 	uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
 	uint32x4_t rk32;
 
 	rk32 = prkhi32;
 	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
 	    vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
 	    3);
 	rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
 	    vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
 	    0);
 
 	return vreinterpretq_u8_u32(rk32);
 }
 
 static uint8x16_t
 aes_schedule_192_smearhi(uint8x16_t rk)
 {
 	uint64x2_t rk64 = vreinterpretq_u64_u8(rk);
 
 	rk64 = vsetq_lane_u64(0, rk64, 0);
 
 	return vreinterpretq_u8_u64(rk64);
 }
 
 void
 aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
 {
 	uint32_t *rk32 = enc->aese_aes.aes_rk;
 	uint8x16_t mrk;		/* mangled round key */
 	uint8x16_t rk;		/* round key */
 	uint8x16_t prk;		/* previous round key */
 	uint8x16_t rcon_rot = rcon;
 	uint64_t i = 3;
 
 	/* input transform */
 	rk = aes_schedule_transform(vld1q_u8(key), ipt);
 	storeroundkey(rk32, rk);
 	rk32 += 4;
 
 	switch (nrounds) {
 	case 10:
 		for (;;) {
 			rk = aes_schedule_round(rk, rk, &rcon_rot);
 			if (--nrounds == 0)
 				break;
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 		}
 		break;
 	case 12: {
 		uint8x16_t prkhi;	/* high half of previous round key */
 
 		prk = rk;
 		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
 		prkhi = aes_schedule_192_smearhi(rk);
 		for (;;) {
 			prk = aes_schedule_round(rk, prk, &rcon_rot);
 			rk = vextq_u8(prkhi, prk, 8);
 
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 			rk = aes_schedule_192_smear(prkhi, prk);
 			prkhi = aes_schedule_192_smearhi(rk);
 
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
 			if ((nrounds -= 3) == 0)
 				break;
 
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 			rk = aes_schedule_192_smear(prkhi, prk);
 			prkhi = aes_schedule_192_smearhi(rk);
 		}
 		break;
 	}
 	case 14: {
 		uint8x16_t pprk;	/* previous previous round key */
 
 		prk = rk;
 		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
 		for (;;) {
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 			pprk = rk;
 
 			/* high round */
 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
 			if ((nrounds -= 2) == 0)
 				break;
 			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 += 4;
 
 			/* low round */
 			rk = vreinterpretq_u8_u32(
 			    vdupq_n_u32(
 				vgetq_lane_u32(vreinterpretq_u32_u8(rk),
 				    3)));
 			rk = aes_schedule_low_round(rk, pprk);
 		}
 		break;
 	}
 	default:
 		panic("invalid number of AES rounds: %u", nrounds);
 	}
 	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
 }
 
 void
 aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
 {
 	uint32_t *rk32 = dec->aesd_aes.aes_rk;
 	uint8x16_t mrk;		/* mangled round key */
 	uint8x16_t ork;		/* original round key */
 	uint8x16_t rk;		/* round key */
 	uint8x16_t prk;		/* previous round key */
 	uint8x16_t rcon_rot = rcon;
 	unsigned i = nrounds == 12 ? 0 : 2;
 
 	ork = vld1q_u8(key);
 
 	/* input transform */
 	rk = aes_schedule_transform(ork, ipt);
 
 	/* go from end */
 	rk32 += 4*nrounds;
 	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
 	rk32 -= 4;
 	i ^= 3;
 
 	switch (nrounds) {
 	case 10:
 		for (;;) {
 			rk = aes_schedule_round(rk, rk, &rcon_rot);
 			if (--nrounds == 0)
 				break;
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 		}
 		break;
 	case 12: {
 		uint8x16_t prkhi;	/* high half of previous round key */
 
 		prk = rk;
 		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
 		prkhi = aes_schedule_192_smearhi(rk);
 		for (;;) {
 			prk = aes_schedule_round(rk, prk, &rcon_rot);
 			rk = vextq_u8(prkhi, prk, 8);
 
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 			rk = aes_schedule_192_smear(prkhi, prk);
 			prkhi = aes_schedule_192_smearhi(rk);
 
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
 			if ((nrounds -= 3) == 0)
 				break;
 
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 			rk = aes_schedule_192_smear(prkhi, prk);
 			prkhi = aes_schedule_192_smearhi(rk);
 		}
 		break;
 	}
 	case 14: {
 		uint8x16_t pprk;	/* previous previous round key */
 
 		prk = rk;
 		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
 		for (;;) {
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 			pprk = rk;
 
 			/* high round */
 			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
 			if ((nrounds -= 2) == 0)
 				break;
 			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
 			storeroundkey(rk32, mrk);
 			rk32 -= 4;
 
 			/* low round */
 			rk = vreinterpretq_u8_u32(
 			    vdupq_n_u32(
 				vgetq_lane_u32(vreinterpretq_u32_u8(rk),
 				    3)));
 			rk = aes_schedule_low_round(rk, pprk);
 		}
 		break;
 	}
 	default:
 		panic("invalid number of AES rounds: %u", nrounds);
 	}
 	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
 }
 
 #ifdef __aarch64__
 
 /*
  * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
  * do the performance-critical parts -- encryption and decryption -- in
  * hand-written assembly on arm32.
  */
 
 uint8x16_t
 aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
 {
 	const uint32_t *rk32 = enc->aese_aes.aes_rk;
 	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
 	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
 	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
 	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
 	uint8x16_t io, jo;
 	unsigned rmod4 = 0;
 
 	x = aes_schedule_transform(x, ipt);
 	x ^= loadroundkey(rk32);
 	for (;;) {
 		uint8x16_t A, A2, A2_B, A2_B_D;
 
 		subbytes(&io, &jo, x, inv_, inva_);
 
 		rk32 += 4;
 		rmod4 = (rmod4 + 1) % 4;
 		if (--nrounds == 0)
 			break;
 
 		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
 		A ^= loadroundkey(rk32);
 		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
 		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
 		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
 		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
 	}
 	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
 	x ^= loadroundkey(rk32);
 	return vqtbl1q_u8(x, sr[rmod4]);
 }
 
+uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = enc->aese_aes.aes_rk;
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
+	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
+	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
+	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1;
+	unsigned rmod4 = 0;
+
+	x0 = aes_schedule_transform(x0, ipt);
+	x1 = aes_schedule_transform(x1, ipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	for (;;) {
+		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
+		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
+
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+
+		rk32 += 4;
+		rmod4 = (rmod4 + 1) % 4;
+		if (--nrounds == 0)
+			break;
+
+		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
+		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
+		A_0 ^= loadroundkey(rk32);
+		A_1 ^= loadroundkey(rk32);
+		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
+		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
+		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
+		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
+		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
+		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
+		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
+		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
+	}
+	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
+	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	return (uint8x16x2_t) { .val = {
+		[0] = vqtbl1q_u8(x0, sr[rmod4]),
+		[1] = vqtbl1q_u8(x1, sr[rmod4]),
+	} };
+}
+
 uint8x16_t
 aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
 {
 	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
 	unsigned i = 3 & ~(nrounds - 1);
 	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 	uint8x16_t io, jo, mc;
 
 	x = aes_schedule_transform(x, dipt);
 	x ^= loadroundkey(rk32);
 	rk32 += 4;
 
 	mc = mc_forward[3];
 	for (;;) {
 		subbytes(&io, &jo, x, inv_, inva_);
 		if (--nrounds == 0)
 			break;
 
 		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
 		x ^= loadroundkey(rk32);
 		rk32 += 4;	/* next round key */
 
 		x = vqtbl1q_u8(x, mc);
 		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);
 
 		x = vqtbl1q_u8(x, mc);
 		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);
 
 		x = vqtbl1q_u8(x, mc);
 		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);
 
 		mc = vextq_u8(mc, mc, 12);
 	}
 	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
 	x ^= loadroundkey(rk32);
 	return vqtbl1q_u8(x, sr[i]);
 }
 
+uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
+{
+	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+	unsigned i = 3 & ~(nrounds - 1);
+	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+	uint8x16_t x0 = x.val[0], x1 = x.val[1];
+	uint8x16_t io0, jo0, io1, jo1, mc;
+
+	x0 = aes_schedule_transform(x0, dipt);
+	x1 = aes_schedule_transform(x1, dipt);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	rk32 += 4;
+
+	mc = mc_forward[3];
+	for (;;) {
+		subbytes(&io0, &jo0, x0, inv_, inva_);
+		subbytes(&io1, &jo1, x1, inv_, inva_);
+		if (--nrounds == 0)
+			break;
+
+		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
+		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
+		x0 ^= loadroundkey(rk32);
+		x1 ^= loadroundkey(rk32);
+		rk32 += 4;	/* next round key */
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
+
+		x0 = vqtbl1q_u8(x0, mc);
+		x1 = vqtbl1q_u8(x1, mc);
+		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
+		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
+
+		mc = vextq_u8(mc, mc, 12);
+	}
+	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
+	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
+	x0 ^= loadroundkey(rk32);
+	x1 ^= loadroundkey(rk32);
+	return (uint8x16x2_t) { .val = {
+		[0] = vqtbl1q_u8(x0, sr[i]),
+		[1] = vqtbl1q_u8(x1, sr[i]),
+	} };
+}
+
 #endif
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,309 +1,382 @@
1 | /* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ | 1 | /* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
15 | * | 15 | * | |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
26 | * POSSIBILITY OF SUCH DAMAGE. | 26 | * POSSIBILITY OF SUCH DAMAGE. | |
27 | */ | 27 | */ | |
28 | 28 | |||
29 | #include <sys/cdefs.h> | 29 | #include <sys/cdefs.h> | |
30 | __KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); | 30 | __KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); | |
31 | 31 | |||
32 | #include <sys/endian.h> | 32 | #include <sys/endian.h> | |
33 | 33 | |||
34 | #ifdef _KERNEL | 34 | #ifdef _KERNEL | |
35 | #include <sys/systm.h> | 35 | #include <sys/systm.h> | |
36 | #include <lib/libkern/libkern.h> | 36 | #include <lib/libkern/libkern.h> | |
37 | #else | 37 | #else | |
38 | #include <assert.h> | 38 | #include <assert.h> | |
39 | #include <inttypes.h> | 39 | #include <inttypes.h> | |
40 | #include <stdio.h> | 40 | #include <stdio.h> | |
41 | #define KASSERT assert | 41 | #define KASSERT assert | |
42 | #endif | 42 | #endif | |
43 | 43 | |||
44 | #include <crypto/aes/arch/arm/aes_neon.h> | 44 | #include <crypto/aes/arch/arm/aes_neon.h> | |
45 | 45 | |||
46 | #include "aes_neon_impl.h" | 46 | #include "aes_neon_impl.h" | |
47 | 47 | |||
48 | static inline uint8x16_t | 48 | static inline uint8x16_t | |
49 | loadblock(const void *in) | 49 | loadblock(const void *in) | |
50 | { | 50 | { | |
51 | return vld1q_u8(in); | 51 | return vld1q_u8(in); | |
52 | } | 52 | } | |
53 | 53 | |||
54 | static inline void | 54 | static inline void | |
55 | storeblock(void *out, uint8x16_t block) | 55 | storeblock(void *out, uint8x16_t block) | |
56 | { | 56 | { | |
57 | vst1q_u8(out, block); | 57 | vst1q_u8(out, block); | |
58 | } | 58 | } | |
59 | 59 | |||
60 | void | 60 | void | |
61 | aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], | 61 | aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], | |
62 | uint8_t out[static 16], uint32_t nrounds) | 62 | uint8_t out[static 16], uint32_t nrounds) | |
63 | { | 63 | { | |
64 | uint8x16_t block; | 64 | uint8x16_t block; | |
65 | 65 | |||
66 | block = loadblock(in); | 66 | block = loadblock(in); | |
67 | block = aes_neon_enc1(enc, block, nrounds); | 67 | block = aes_neon_enc1(enc, block, nrounds); | |
68 | storeblock(out, block); | 68 | storeblock(out, block); | |
69 | } | 69 | } | |
70 | 70 | |||
71 | void | 71 | void | |
72 | aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], | 72 | aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], | |
73 | uint8_t out[static 16], uint32_t nrounds) | 73 | uint8_t out[static 16], uint32_t nrounds) | |
74 | { | 74 | { | |
75 | uint8x16_t block; | 75 | uint8x16_t block; | |
76 | 76 | |||
77 | block = loadblock(in); | 77 | block = loadblock(in); | |
78 | block = aes_neon_dec1(dec, block, nrounds); | 78 | block = aes_neon_dec1(dec, block, nrounds); | |
79 | storeblock(out, block); | 79 | storeblock(out, block); | |
80 | } | 80 | } | |
81 | 81 | |||
82 | void | 82 | void | |
83 | aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], | 83 | aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], | |
84 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | 84 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | |
85 | uint32_t nrounds) | 85 | uint32_t nrounds) | |
86 | { | 86 | { | |
87 | uint8x16_t cv; | 87 | uint8x16_t cv; | |
88 | 88 | |||
89 | KASSERT(nbytes); | 89 | KASSERT(nbytes); | |
90 | 90 | |||
91 | cv = loadblock(iv); | 91 | cv = loadblock(iv); | |
92 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 92 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | |
93 | cv ^= loadblock(in); | 93 | cv ^= loadblock(in); | |
94 | cv = aes_neon_enc1(enc, cv, nrounds); | 94 | cv = aes_neon_enc1(enc, cv, nrounds); | |
95 | storeblock(out, cv); | 95 | storeblock(out, cv); | |
96 | } | 96 | } | |
97 | storeblock(iv, cv); | 97 | storeblock(iv, cv); | |
98 | } | 98 | } | |
99 | 99 | |||
100 | void | 100 | void | |
101 | aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], | 101 | aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], | |
102 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | 102 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | |
103 | uint32_t nrounds) | 103 | uint32_t nrounds) | |
104 | { | 104 | { | |
105 | uint8x16_t iv0, cv, b; | 105 | uint8x16_t iv0, cv, b; | |
106 | 106 | |||
107 | KASSERT(nbytes); | 107 | KASSERT(nbytes); | |
108 | KASSERT(nbytes % 16 == 0); | 108 | KASSERT(nbytes % 16 == 0); | |
109 | 109 | |||
110 | iv0 = loadblock(iv); | 110 | iv0 = loadblock(iv); | |
111 | cv = loadblock(in + nbytes - 16); | 111 | cv = loadblock(in + nbytes - 16); | |
112 | storeblock(iv, cv); | 112 | storeblock(iv, cv); | |
113 | 113 | |||
114 | for (;;) { | 114 | if (nbytes % 32) { | |
115 | KASSERT(nbytes % 32 == 16); | |||
115 | b = aes_neon_dec1(dec, cv, nrounds); | 116 | b = aes_neon_dec1(dec, cv, nrounds); | |
116 | if ((nbytes -= 16) == 0) | 117 | if ((nbytes -= 16) == 0) | |
117 | break; | 118 | goto out; | |
119 | cv = loadblock(in + nbytes - 16); | |||
120 | storeblock(out + nbytes, cv ^ b); | |||
121 | } | |||
122 | ||||
123 | for (;;) { | |||
124 | uint8x16x2_t b2; | |||
125 | ||||
126 | KASSERT(nbytes >= 32); | |||
127 | ||||
128 | b2.val[1] = cv; | |||
129 | b2.val[0] = cv = loadblock(in + nbytes - 32); | |||
130 | b2 = aes_neon_dec2(dec, b2, nrounds); | |||
131 | storeblock(out + nbytes - 16, cv ^ b2.val[1]); | |||
132 | if ((nbytes -= 32) == 0) { | |||
133 | b = b2.val[0]; | |||
134 | goto out; | |||
135 | } | |||
118 | cv = loadblock(in + nbytes - 16); | 136 | cv = loadblock(in + nbytes - 16); | |
119 | storeblock(out + nbytes, b ^ cv); | 137 | storeblock(out + nbytes, cv ^ b2.val[0]); | |
120 | } | 138 | } | |
121 | storeblock(out, b ^ iv0); | 139 | ||
140 | out: storeblock(out, b ^ iv0); | |||
122 | } | 141 | } | |
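
The rewritten aes_neon_cbc_dec walks the buffer back to front: one trailing
block is peeled off when the length is an odd multiple of 16, and the rest
are decrypted in pairs through aes_neon_dec2. Here is a minimal scalar
sketch of the same schedule, with hypothetical dec1()/dec2() stand-ins for
the vpaes primitives; blocks go last-to-first so each ciphertext block is
still on hand to serve as its successor's chaining value:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for one- and two-block AES inversion. */
static void dec1(uint8_t b[16]) { (void)b; /* invert one block */ }
static void dec2(uint8_t b0[16], uint8_t b1[16]) { dec1(b0); dec1(b1); }

static void
xor16(uint8_t *d, const uint8_t *a, const uint8_t *b)
{
	int i;

	for (i = 0; i < 16; i++)
		d[i] = a[i] ^ b[i];
}

static void
cbc_dec_2way(const uint8_t *in, uint8_t *out, size_t nbytes, uint8_t iv[16])
{
	uint8_t iv0[16], cv[16], b[16], b0[16], b1[16];

	memcpy(iv0, iv, 16);
	memcpy(cv, in + nbytes - 16, 16);
	memcpy(iv, cv, 16);		/* the next call chains from here */

	if (nbytes % 32) {		/* peel one odd trailing block */
		memcpy(b, cv, 16);
		dec1(b);
		if ((nbytes -= 16) == 0)
			goto last;
		memcpy(cv, in + nbytes - 16, 16);
		xor16(out + nbytes, cv, b);
	}
	for (;;) {			/* two blocks per iteration */
		memcpy(b1, cv, 16);
		memcpy(b0, in + nbytes - 32, 16);
		memcpy(cv, b0, 16);
		dec2(b0, b1);
		xor16(out + nbytes - 16, cv, b1);
		if ((nbytes -= 32) == 0) {
			memcpy(b, b0, 16);
			goto last;
		}
		memcpy(cv, in + nbytes - 16, 16);
		xor16(out + nbytes, cv, b0);
	}
last:
	xor16(out, b, iv0);		/* first block uses the old IV */
}
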
123 | 142 | |||
124 | static inline uint8x16_t | 143 | static inline uint8x16_t | |
125 | aes_neon_xts_update(uint8x16_t t8) | 144 | aes_neon_xts_update(uint8x16_t t8) | |
126 | { | 145 | { | |
127 | const int32x4_t zero = vdupq_n_s32(0); | 146 | const int32x4_t zero = vdupq_n_s32(0); | |
128 | const int32x4_t carry = {0x87, 1, 1, 1}; | 147 | const int32x4_t carry = {0x87, 1, 1, 1}; | |
129 | int32x4_t t, t_; | 148 | int32x4_t t, t_; | |
130 | uint32x4_t mask; | 149 | uint32x4_t mask; | |
131 | 150 | |||
132 | t = vreinterpretq_s32_u8(t8); | 151 | t = vreinterpretq_s32_u8(t8); | |
133 | mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ | 152 | mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ | |
134 | mask = vextq_u32(mask, mask, 3); /* rotate quarters */ | 153 | mask = vextq_u32(mask, mask, 3); /* rotate quarters */ | |
135 | t_ = vsliq_n_s32(zero, t, 1); /* shift */ | 154 | t_ = vsliq_n_s32(zero, t, 1); /* shift */ | |
136 | t_ ^= carry & mask; | 155 | t_ ^= carry & mask; | |
137 | 156 | |||
138 | return vreinterpretq_u8_s32(t_); | 157 | return vreinterpretq_u8_s32(t_); | |
139 | } | 158 | } | |
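
aes_neon_xts_update multiplies the 128-bit tweak by x in GF(2^128) modulo
x^128 + x^7 + x^2 + x + 1, one step of the XTS tweak schedule: each 32-bit
lane shifts left by one, the bit shifted out of each lane carries into the
next, and the bit out of the top lane folds back into the bottom as 0x87.
A portable scalar rendition of the same computation (a sketch assuming a
little-endian host), which can be checked against the selftest cases below:

#include <stdint.h>
#include <string.h>

/*
 * Multiply a little-endian 128-bit XTS tweak by x modulo
 * x^128 + x^7 + x^2 + x + 1.  This is what aes_neon_xts_update
 * computes with vcltq_s32/vextq_u32/vsliq_n_s32.
 */
static void
xts_update_ref(uint8_t t[16])
{
	uint32_t w[4], carry;

	memcpy(w, t, 16);		/* lanes, little-endian host order */
	carry = w[3] >> 31;		/* bit shifted out the top */
	w[3] = (w[3] << 1) | (w[2] >> 31);
	w[2] = (w[2] << 1) | (w[1] >> 31);
	w[1] = (w[1] << 1) | (w[0] >> 31);
	w[0] = (w[0] << 1) ^ (carry ? 0x87 : 0);
	memcpy(t, w, 16);
}
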
140 | 159 | |||
141 | static int | 160 | static int | |
142 | aes_neon_xts_update_selftest(void) | 161 | aes_neon_xts_update_selftest(void) | |
143 | { | 162 | { | |
144 | static const struct { | 163 | static const struct { | |
145 | uint32_t in[4], out[4]; | 164 | uint32_t in[4], out[4]; | |
146 | } cases[] = { | 165 | } cases[] = { | |
147 | [0] = { {1}, {2} }, | 166 | [0] = { {1}, {2} }, | |
148 | [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, | 167 | [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, | |
149 | [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, | 168 | [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, | |
150 | [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, | 169 | [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, | |
151 | [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, | 170 | [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, | |
152 | [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, | 171 | [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, | |
153 | }; | 172 | }; | |
154 | unsigned i; | 173 | unsigned i; | |
155 | uint32_t t[4]; | 174 | uint32_t t[4]; | |
156 | int result = 0; | 175 | int result = 0; | |
157 | 176 | |||
158 | for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { | 177 | for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { | |
159 | t[0] = cases[i].in[0]; | 178 | t[0] = cases[i].in[0]; | |
160 | t[1] = cases[i].in[1]; | 179 | t[1] = cases[i].in[1]; | |
161 | t[2] = cases[i].in[2]; | 180 | t[2] = cases[i].in[2]; | |
162 | t[3] = cases[i].in[3]; | 181 | t[3] = cases[i].in[3]; | |
163 | storeblock(t, aes_neon_xts_update(loadblock(t))); | 182 | storeblock(t, aes_neon_xts_update(loadblock(t))); | |
164 | if (t[0] != cases[i].out[0] || | 183 | if (t[0] != cases[i].out[0] || | |
165 | t[1] != cases[i].out[1] || | 184 | t[1] != cases[i].out[1] || | |
166 | t[2] != cases[i].out[2] || | 185 | t[2] != cases[i].out[2] || | |
167 | t[3] != cases[i].out[3]) { | 186 | t[3] != cases[i].out[3]) { | |
168 | printf("%s %u:" | 187 | printf("%s %u:" | |
169 | " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", | 188 | " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", | |
170 | __func__, i, t[0], t[1], t[2], t[3]); | 189 | __func__, i, t[0], t[1], t[2], t[3]); | |
171 | result = -1; | 190 | result = -1; | |
172 | } | 191 | } | |
173 | } | 192 | } | |
174 | 193 | |||
175 | return result; | 194 | return result; | |
176 | } | 195 | } | |
177 | 196 | |||
178 | void | 197 | void | |
179 | aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], | 198 | aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], | |
180 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | 199 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | |
181 | uint32_t nrounds) | 200 | uint32_t nrounds) | |
182 | { | 201 | { | |
183 | uint8x16_t t, b; | 202 | uint8x16_t t, b; | |
184 | 203 | |||
185 | KASSERT(nbytes); | 204 | KASSERT(nbytes); | |
186 | KASSERT(nbytes % 16 == 0); | 205 | KASSERT(nbytes % 16 == 0); | |
187 | 206 | |||
188 | t = loadblock(tweak); | 207 | t = loadblock(tweak); | |
189 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 208 | if (nbytes % 32) { | |
209 | KASSERT(nbytes % 32 == 16); | |||
190 | b = t ^ loadblock(in); | 210 | b = t ^ loadblock(in); | |
191 | b = aes_neon_enc1(enc, b, nrounds); | 211 | b = aes_neon_enc1(enc, b, nrounds); | |
192 | storeblock(out, t ^ b); | 212 | storeblock(out, t ^ b); | |
193 | t = aes_neon_xts_update(t); | 213 | t = aes_neon_xts_update(t); | |
214 | nbytes -= 16; | |||
215 | in += 16; | |||
216 | out += 16; | |||
217 | } | |||
218 | for (; nbytes; nbytes -= 32, in += 32, out += 32) { | |||
219 | uint8x16_t t1; | |||
220 | uint8x16x2_t b2; | |||
221 | ||||
222 | t1 = aes_neon_xts_update(t); | |||
223 | b2.val[0] = t ^ loadblock(in); | |||
224 | b2.val[1] = t1 ^ loadblock(in + 16); | |||
225 | b2 = aes_neon_enc2(enc, b2, nrounds); | |||
226 | storeblock(out, b2.val[0] ^ t); | |||
227 | storeblock(out + 16, b2.val[1] ^ t1); | |||
228 | ||||
229 | t = aes_neon_xts_update(t1); | |||
194 | } | 230 | } | |
195 | storeblock(tweak, t); | 231 | storeblock(tweak, t); | |
196 | } | 232 | } | |
197 | 233 | |||
198 | void | 234 | void | |
199 | aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], | 235 | aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], | |
200 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | 236 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | |
201 | uint32_t nrounds) | 237 | uint32_t nrounds) | |
202 | { | 238 | { | |
203 | uint8x16_t t, b; | 239 | uint8x16_t t, b; | |
204 | 240 | |||
205 | KASSERT(nbytes); | 241 | KASSERT(nbytes); | |
206 | KASSERT(nbytes % 16 == 0); | 242 | KASSERT(nbytes % 16 == 0); | |
207 | 243 | |||
208 | t = loadblock(tweak); | 244 | t = loadblock(tweak); | |
209 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 245 | if (nbytes % 32) { | |
246 | KASSERT(nbytes % 32 == 16); | |||
210 | b = t ^ loadblock(in); | 247 | b = t ^ loadblock(in); | |
211 | b = aes_neon_dec1(dec, b, nrounds); | 248 | b = aes_neon_dec1(dec, b, nrounds); | |
212 | storeblock(out, t ^ b); | 249 | storeblock(out, t ^ b); | |
213 | t = aes_neon_xts_update(t); | 250 | t = aes_neon_xts_update(t); | |
251 | nbytes -= 16; | |||
252 | in += 16; | |||
253 | out += 16; | |||
254 | } | |||
255 | for (; nbytes; nbytes -= 32, in += 32, out += 32) { | |||
256 | uint8x16_t t1; | |||
257 | uint8x16x2_t b2; | |||
258 | ||||
259 | t1 = aes_neon_xts_update(t); | |||
260 | b2.val[0] = t ^ loadblock(in); | |||
261 | b2.val[1] = t1 ^ loadblock(in + 16); | |||
262 | b2 = aes_neon_dec2(dec, b2, nrounds); | |||
263 | storeblock(out, b2.val[0] ^ t); | |||
264 | storeblock(out + 16, b2.val[1] ^ t1); | |||
265 | ||||
266 | t = aes_neon_xts_update(t1); | |||
214 | } | 267 | } | |
215 | storeblock(tweak, t); | 268 | storeblock(tweak, t); | |
216 | } | 269 | } | |
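
Both XTS loops follow the same pattern as CBC decryption: peel one odd
block, then handle pairs, precomputing the partner tweak t1 = update(t) up
front so that whitening, the two AES computations, and unwhitening are
independent across the pair; only the tweak chain itself stays sequential.
A hedged usage sketch, assuming the caller has expanded the key and
computed the initial tweak (E_K2 of the sector number) as the surrounding
aes(9) code does:

#include <crypto/aes/arch/arm/aes_neon.h>

/*
 * Encrypt one 512-byte disk sector in place.  512 = 32 * 16, so
 * aes_neon_xts_enc never takes the odd-block path and runs the
 * paired loop for all sixteen iterations.
 */
static void
xts_encrypt_sector(const struct aesenc *enc, uint8_t buf[512],
    uint8_t tweak[16])
{
	aes_neon_xts_enc(enc, buf, buf, 512, tweak, 10/*AES-128*/);
}
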
217 | 270 | |||
218 | void | 271 | void | |
219 | aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], | 272 | aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], | |
220 | size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) | 273 | size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) | |
221 | { | 274 | { | |
222 | uint8x16_t auth; | 275 | uint8x16_t auth; | |
223 | 276 | |||
224 | KASSERT(nbytes); | 277 | KASSERT(nbytes); | |
225 | KASSERT(nbytes % 16 == 0); | 278 | KASSERT(nbytes % 16 == 0); | |
226 | 279 | |||
227 | auth = loadblock(auth0); | 280 | auth = loadblock(auth0); | |
228 | for (; nbytes; nbytes -= 16, in += 16) | 281 | for (; nbytes; nbytes -= 16, in += 16) | |
229 | auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds); | 282 | auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds); | |
230 | storeblock(auth0, auth); | 283 | storeblock(auth0, auth); | |
231 | } | 284 | } | |
232 | 285 | |||
233 | /* | 286 | /* | |
234 | * XXX On aarch64, we have enough registers that we should be able to | 287 | * XXX On aarch64, we have enough registers that we should be able to | |
235 | * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' | 288 | * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' | |
236 | * function, which should substantially improve CCM throughput. | 289 | * function, which should substantially improve CCM throughput. | |
237 | */ | 290 | */ | |
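
The new CCM loops below do exactly this through aes_neon_enc2;
aes_neon_cbcmac_update1 above stays serial because each AES call consumes
the previous one's output. The payoff of the pairing is instruction-level
parallelism; a sketch of the shape, with a hypothetical round1() standing
in for one vpaes round:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } blk;	/* 128-bit block model */

/* Hypothetical stand-in for one vpaes round with round key rk. */
static blk
round1(blk b, blk rk)
{
	b.lo ^= rk.lo;
	b.hi ^= rk.hi;
	return b;
}

/*
 * Shape of a 2-way enc2: each round advances two independent
 * blocks, so an in-order pipeline (Cortex-A53) always has a second
 * instruction stream to issue while the first one's TBL results
 * are still in flight.
 */
static void
enc2_shape(blk *a, blk *b, const blk *rk, unsigned nrounds)
{
	unsigned r;

	for (r = 0; r < nrounds; r++) {
		*a = round1(*a, rk[r]);
		*b = round1(*b, rk[r]);
	}
}
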
238 | 291 | |||
239 | #if _BYTE_ORDER == _LITTLE_ENDIAN | 292 | #if _BYTE_ORDER == _LITTLE_ENDIAN | |
240 | #define vbetoh32q_u8 vrev32q_u8 | 293 | #define vbetoh32q_u8 vrev32q_u8 | |
241 | #define vhtobe32q_u8 vrev32q_u8 | 294 | #define vhtobe32q_u8 vrev32q_u8 | |
242 | #elif _BYTE_ORDER == _BIG_ENDIAN | 295 | #elif _BYTE_ORDER == _BIG_ENDIAN | |
243 | #define vbetoh32q_u8(x) (x) | 296 | #define vbetoh32q_u8(x) (x) | |
244 | #define vhtobe32q_u8(x) (x) | 297 | #define vhtobe32q_u8(x) (x) | |
245 | #else | 298 | #else | |
246 | #error what kind of endian are you anyway | 299 | #error what kind of endian are you anyway | |
247 | #endif | 300 | #endif | |
248 | 301 | |||
249 | void | 302 | void | |
250 | aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], | 303 | aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], | |
251 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | 304 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | |
252 | uint32_t nrounds) | 305 | uint32_t nrounds) | |
253 | { | 306 | { | |
254 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | 307 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | |
255 | uint8x16_t auth, ptxt, ctr_be; | 308 | uint8x16_t auth, ptxt, ctr_be; | |
256 | uint32x4_t ctr; | 309 | uint32x4_t ctr; | |
257 | 310 | |||
258 | KASSERT(nbytes); | 311 | KASSERT(nbytes); | |
259 | KASSERT(nbytes % 16 == 0); | 312 | KASSERT(nbytes % 16 == 0); | |
260 | 313 | |||
261 | auth = loadblock(authctr); | 314 | auth = loadblock(authctr); | |
262 | ctr_be = loadblock(authctr + 16); | 315 | ctr_be = loadblock(authctr + 16); | |
263 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | 316 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | |
264 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 317 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | |
318 | uint8x16x2_t b2; | |||
265 | ptxt = loadblock(in); | 319 | ptxt = loadblock(in); | |
266 | auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); | |||
267 | ctr = vaddq_u32(ctr, ctr32_inc); | 320 | ctr = vaddq_u32(ctr, ctr32_inc); | |
268 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | 321 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |
269 | storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); | 322 | ||
323 | b2.val[0] = auth ^ ptxt; | |||
324 | b2.val[1] = ctr_be; | |||
325 | b2 = aes_neon_enc2(enc, b2, nrounds); | |||
326 | auth = b2.val[0]; | |||
327 | storeblock(out, ptxt ^ b2.val[1]); | |||
270 | } | 328 | } | |
271 | storeblock(authctr, auth); | 329 | storeblock(authctr, auth); | |
272 | storeblock(authctr + 16, ctr_be); | 330 | storeblock(authctr + 16, ctr_be); | |
273 | } | 331 | } | |
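
CCM encryption pairs naturally: for a given block, the CBC-MAC update
AES(auth ^ ptxt) and the CTR pad AES(ctr + 1) share no data, so the
rewritten loop feeds them to aes_neon_enc2 as lanes 0 and 1. A scalar
sketch of the per-block dataflow, with hypothetical enc2()/be_incr()
stand-ins:

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in: encrypt two independent blocks. */
static void enc2(uint8_t b0[16], uint8_t b1[16]) { (void)b0; (void)b1; }

static void
be_incr(uint8_t ctr[16])	/* increment a big-endian 128-bit counter */
{
	int i;

	for (i = 15; i >= 0 && ++ctr[i] == 0; i--)
		continue;
}

/*
 * One iteration of the paired CCM encryption loop: lane 0 carries
 * the CBC-MAC input, lane 1 the CTR input.
 */
static void
ccm_enc_step(uint8_t auth[16], uint8_t ctr[16], const uint8_t ptxt[16],
    uint8_t out[16])
{
	uint8_t b0[16], b1[16];
	int i;

	be_incr(ctr);
	for (i = 0; i < 16; i++)
		b0[i] = auth[i] ^ ptxt[i];	/* lane 0: MAC input */
	memcpy(b1, ctr, 16);			/* lane 1: counter */
	enc2(b0, b1);
	memcpy(auth, b0, 16);			/* new MAC state */
	for (i = 0; i < 16; i++)
		out[i] = ptxt[i] ^ b1[i];	/* keystream XOR */
}
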
274 | 332 | |||
275 | void | 333 | void | |
276 | aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], | 334 | aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], | |
277 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | 335 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | |
278 | uint32_t nrounds) | 336 | uint32_t nrounds) | |
279 | { | 337 | { | |
280 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | 338 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | |
281 | uint8x16_t auth, ctr_be, ptxt; | 339 | uint8x16_t auth, ctr_be, ptxt, pad; | |
282 | uint32x4_t ctr; | 340 | uint32x4_t ctr; | |
283 | 341 | |||
284 | KASSERT(nbytes); | 342 | KASSERT(nbytes); | |
285 | KASSERT(nbytes % 16 == 0); | 343 | KASSERT(nbytes % 16 == 0); | |
286 | 344 | |||
287 | auth = loadblock(authctr); | |||
288 | ctr_be = loadblock(authctr + 16); | 345 | ctr_be = loadblock(authctr + 16); | |
289 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | 346 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | |
290 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 347 | ctr = vaddq_u32(ctr, ctr32_inc); | |
348 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |||
349 | pad = aes_neon_enc1(enc, ctr_be, nrounds); | |||
350 | auth = loadblock(authctr); | |||
351 | for (;; in += 16, out += 16) { | |||
352 | uint8x16x2_t b2; | |||
353 | ||||
354 | ptxt = loadblock(in) ^ pad; | |||
355 | auth ^= ptxt; | |||
356 | storeblock(out, ptxt); | |||
357 | ||||
358 | if ((nbytes -= 16) == 0) | |||
359 | break; | |||
360 | ||||
291 | ctr = vaddq_u32(ctr, ctr32_inc); | 361 | ctr = vaddq_u32(ctr, ctr32_inc); | |
292 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | 362 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |
293 | ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); | 363 | b2.val[0] = auth; | |
294 | storeblock(out, ptxt); | 364 | b2.val[1] = ctr_be; | |
295 | auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); | 365 | b2 = aes_neon_enc2(enc, b2, nrounds); | |
366 | auth = b2.val[0]; | |||
367 | pad = b2.val[1]; | |||
296 | } | 368 | } | |
369 | auth = aes_neon_enc1(enc, auth, nrounds); | |||
297 | storeblock(authctr, auth); | 370 | storeblock(authctr, auth); | |
298 | storeblock(authctr + 16, ctr_be); | 371 | storeblock(authctr + 16, ctr_be); | |
299 | } | 372 | } | |
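
Decryption cannot pair the two AES calls for the same block: the MAC
update needs the plaintext, which needs the pad. The rewrite
software-pipelines instead, computing the pad for block i+1 alongside the
deferred MAC encryption for block i, with the first pad primed before the
loop and the last MAC encryption drained after it. A scalar sketch of that
schedule, again with hypothetical enc1()/enc2()/be_incr() stand-ins:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for the vpaes primitives. */
static void enc1(uint8_t b[16]) { (void)b; /* encrypt one block */ }
static void enc2(uint8_t b0[16], uint8_t b1[16]) { enc1(b0); enc1(b1); }

static void
be_incr(uint8_t ctr[16])
{
	int i;

	for (i = 15; i >= 0 && ++ctr[i] == 0; i--)
		continue;
}

static void
ccm_dec_pipelined(const uint8_t *in, uint8_t *out, size_t nbytes,
    uint8_t auth[16], uint8_t ctr[16])
{
	uint8_t pad[16], ptxt[16];
	int i;

	be_incr(ctr);
	memcpy(pad, ctr, 16);
	enc1(pad);			/* prologue: pad for block 0 */
	for (;; in += 16, out += 16) {
		for (i = 0; i < 16; i++) {
			ptxt[i] = in[i] ^ pad[i];
			auth[i] ^= ptxt[i];	/* MAC xor, AES deferred */
		}
		memcpy(out, ptxt, 16);
		if ((nbytes -= 16) == 0)
			break;
		be_incr(ctr);
		memcpy(pad, ctr, 16);
		enc2(auth, pad);	/* MAC of block i, pad of i+1 */
	}
	enc1(auth);			/* epilogue: last MAC encryption */
}
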
300 | 373 | |||
301 | int | 374 | int | |
302 | aes_neon_selftest(void) | 375 | aes_neon_selftest(void) | |
303 | { | 376 | { | |
304 | 377 | |||
305 | if (aes_neon_xts_update_selftest()) | 378 | if (aes_neon_xts_update_selftest()) | |
306 | return -1; | 379 | return -1; | |
307 | 380 | |||
308 | return 0; | 381 | return 0; | |
309 | } | 382 | } |
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,42 +1,71 @@ | @@ -1,42 +1,71 @@ | |||
1 | /* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ | 1 | /* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
15 | * | 15 | * | |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
26 | * POSSIBILITY OF SUCH DAMAGE. | 26 | * POSSIBILITY OF SUCH DAMAGE. | |
27 | */ | 27 | */ | |
28 | 28 | |||
29 | #ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | 29 | #ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | |
30 | #define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | 30 | #define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | |
31 | 31 | |||
32 | #include <sys/types.h> | 32 | #include <sys/types.h> | |
33 | 33 | |||
34 | #include "arm_neon.h" | 34 | #include "arm_neon.h" | |
35 | 35 | |||
36 | #include <crypto/aes/aes.h> | 36 | #include <crypto/aes/aes.h> | |
37 | #include <crypto/aes/arch/arm/aes_neon.h> | 37 | #include <crypto/aes/arch/arm/aes_neon.h> | |
38 | 38 | |||
39 | uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); | 39 | uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); | |
40 | uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); | 40 | uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); | |
41 | 41 | |||
42 | #ifdef __aarch64__ | |||
43 | ||||
44 | uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned); | |||
45 | uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned); | |||
46 | ||||
47 | #else | |||
48 | ||||
49 | static inline uint8x16x2_t | |||
50 | aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds) | |||
51 | { | |||
52 | ||||
53 | return (uint8x16x2_t) { .val = { | |||
54 | [0] = aes_neon_enc1(enc, b2.val[0], nrounds), | |||
55 | [1] = aes_neon_enc1(enc, b2.val[1], nrounds), | |||
56 | } }; | |||
57 | } | |||
58 | ||||
59 | static inline uint8x16x2_t | |||
60 | aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds) | |||
61 | { | |||
62 | ||||
63 | return (uint8x16x2_t) { .val = { | |||
64 | [0] = aes_neon_dec1(dec, b2.val[0], nrounds), | |||
65 | [1] = aes_neon_dec1(dec, b2.val[1], nrounds), | |||
66 | } }; | |||
67 | } | |||
68 | ||||
69 | #endif | |||
70 | ||||
42 | #endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ | 71 | #endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ |
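
On 32-bit ARM there are not enough registers to keep two interleaved vpaes
computations live, so the header degrades aes_neon_enc2/aes_neon_dec2 to
two serial single-block calls; the callers in aes_neon.c can then be
written uniformly for both targets. A hedged usage sketch:

#include "aes_neon_impl.h"

/*
 * Encrypt two independent blocks with one call.  On aarch64 this
 * reaches the fused 2x vpaes implementation; on 32-bit ARM it
 * falls back to two aes_neon_enc1 calls with identical results.
 */
static uint8x16x2_t
encrypt_pair(const struct aesenc *enc, uint8x16_t a, uint8x16_t b,
    unsigned nrounds)
{
	uint8x16x2_t b2 = { .val = { a, b } };

	return aes_neon_enc2(enc, b2, nrounds);
}
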
--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,534 +1,536 @@ | @@ -1,534 +1,536 @@ | |||
1 | /* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ | 1 | /* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
15 | * | 15 | * | |
16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | 16 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | |
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
26 | * POSSIBILITY OF SUCH DAMAGE. | 26 | * POSSIBILITY OF SUCH DAMAGE. | |
27 | */ | 27 | */ | |
28 | 28 | |||
29 | #ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H | 29 | #ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H | |
30 | #define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H | 30 | #define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H | |
31 | 31 | |||
32 | #if defined(__GNUC__) && !defined(__clang__) | 32 | #if defined(__GNUC__) && !defined(__clang__) | |
33 | 33 | |||
34 | #define _INTRINSATTR \ | 34 | #define _INTRINSATTR \ | |
35 | __extension__ \ | 35 | __extension__ \ | |
36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | 36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | |
37 | 37 | |||
38 | #ifdef __aarch64__ | 38 | #ifdef __aarch64__ | |
39 | typedef __Int32x4_t int32x4_t; | 39 | typedef __Int32x4_t int32x4_t; | |
40 | typedef __Int64x2_t int64x2_t; | 40 | typedef __Int64x2_t int64x2_t; | |
41 | typedef __Int8x16_t int8x16_t; | 41 | typedef __Int8x16_t int8x16_t; | |
42 | typedef __Uint32x4_t uint32x4_t; | 42 | typedef __Uint32x4_t uint32x4_t; | |
43 | typedef __Uint64x2_t uint64x2_t; | 43 | typedef __Uint64x2_t uint64x2_t; | |
44 | typedef __Uint8x16_t uint8x16_t; | 44 | typedef __Uint8x16_t uint8x16_t; | |
45 | typedef struct { uint8x16_t val[2]; } uint8x16x2_t; | |||
45 | #else | 46 | #else | |
46 | typedef __simd128_int32_t int32x4_t; | 47 | typedef __simd128_int32_t int32x4_t; | |
47 | typedef __simd128_int64_t int64x2_t; | 48 | typedef __simd128_int64_t int64x2_t; | |
48 | typedef __simd128_int8_t int8x16_t; | 49 | typedef __simd128_int8_t int8x16_t; | |
49 | typedef __simd128_uint32_t uint32x4_t; | 50 | typedef __simd128_uint32_t uint32x4_t; | |
50 | typedef __simd128_uint64_t uint64x2_t; | 51 | typedef __simd128_uint64_t uint64x2_t; | |
51 | typedef __simd128_uint8_t uint8x16_t; | 52 | typedef __simd128_uint8_t uint8x16_t; | |
52 | 53 | |||
53 | typedef __simd64_int8_t int8x8_t; | 54 | typedef __simd64_int8_t int8x8_t; | |
54 | typedef __simd64_uint8_t uint8x8_t; | 55 | typedef __simd64_uint8_t uint8x8_t; | |
55 | typedef __builtin_neon_udi uint64x1_t; | 56 | typedef __builtin_neon_udi uint64x1_t; | |
56 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | 57 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | |
58 | typedef struct { uint8x16_t val[2]; } uint8x16x2_t; | |||
57 | #endif | 59 | #endif | |
58 | 60 | |||
59 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | 61 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | |
60 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | 62 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | |
61 | #else | 63 | #else | |
62 | #define __neon_lane_index(__v, __i) __i | 64 | #define __neon_lane_index(__v, __i) __i | |
63 | #endif | 65 | #endif | |
64 | 66 | |||
65 | #elif defined(__clang__) | 67 | #elif defined(__clang__) | |
66 | 68 | |||
67 | #define _INTRINSATTR \ | 69 | #define _INTRINSATTR \ | |
68 | __attribute__((__always_inline__, __nodebug__)) | 70 | __attribute__((__always_inline__, __nodebug__)) | |
69 | 71 | |||
70 | typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; | 72 | typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; | |
71 | typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; | 73 | typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; | |
72 | typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; | 74 | typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; | |
73 | typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; | 75 | typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; | |
74 | typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; | 76 | typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; | |
75 | typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; | 77 | typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; | |
76 | 78 | |||
77 | typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; | 79 | typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; | |
78 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | 80 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | |
79 | 81 | |||
80 | #ifdef __LITTLE_ENDIAN__ | 82 | #ifdef __LITTLE_ENDIAN__ | |
81 | #define __neon_lane_index(__v, __i) __i | 83 | #define __neon_lane_index(__v, __i) __i | |
82 | #else | 84 | #else | |
83 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | 85 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | |
84 | #endif | 86 | #endif | |
85 | 87 | |||
86 | #else | 88 | #else | |
87 | 89 | |||
88 | #error Teach me how to neon in your compile! | 90 | #error Teach me how to neon in your compile! | |
89 | 91 | |||
90 | #endif | 92 | #endif | |
91 | 93 | |||
92 | _INTRINSATTR | 94 | _INTRINSATTR | |
93 | static __inline uint32x4_t | 95 | static __inline uint32x4_t | |
94 | vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) | 96 | vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) | |
95 | { | 97 | { | |
96 | return __v0 + __v1; | 98 | return __v0 + __v1; | |
97 | } | 99 | } | |
98 | 100 | |||
99 | _INTRINSATTR | 101 | _INTRINSATTR | |
100 | static __inline uint32x4_t | 102 | static __inline uint32x4_t | |
101 | vcltq_s32(int32x4_t __v0, int32x4_t __v1) | 103 | vcltq_s32(int32x4_t __v0, int32x4_t __v1) | |
102 | { | 104 | { | |
103 | return (uint32x4_t)(__v0 < __v1); | 105 | return (uint32x4_t)(__v0 < __v1); | |
104 | } | 106 | } | |
105 | 107 | |||
106 | _INTRINSATTR | 108 | _INTRINSATTR | |
107 | static __inline int32x4_t | 109 | static __inline int32x4_t | |
108 | vdupq_n_s32(int32_t __x) | 110 | vdupq_n_s32(int32_t __x) | |
109 | { | 111 | { | |
110 | return (int32x4_t) { __x, __x, __x, __x }; | 112 | return (int32x4_t) { __x, __x, __x, __x }; | |
111 | } | 113 | } | |
112 | 114 | |||
113 | _INTRINSATTR | 115 | _INTRINSATTR | |
114 | static __inline uint32x4_t | 116 | static __inline uint32x4_t | |
115 | vdupq_n_u32(uint32_t __x) | 117 | vdupq_n_u32(uint32_t __x) | |
116 | { | 118 | { | |
117 | return (uint32x4_t) { __x, __x, __x, __x }; | 119 | return (uint32x4_t) { __x, __x, __x, __x }; | |
118 | } | 120 | } | |
119 | 121 | |||
120 | _INTRINSATTR | 122 | _INTRINSATTR | |
121 | static __inline uint8x16_t | 123 | static __inline uint8x16_t | |
122 | vdupq_n_u8(uint8_t __x) | 124 | vdupq_n_u8(uint8_t __x) | |
123 | { | 125 | { | |
124 | return (uint8x16_t) { | 126 | return (uint8x16_t) { | |
125 | __x, __x, __x, __x, __x, __x, __x, __x, | 127 | __x, __x, __x, __x, __x, __x, __x, __x, | |
126 | __x, __x, __x, __x, __x, __x, __x, __x, | 128 | __x, __x, __x, __x, __x, __x, __x, __x, | |
127 | }; | 129 | }; | |
128 | } | 130 | } | |
129 | 131 | |||
130 | #if defined(__GNUC__) && !defined(__clang__) | 132 | #if defined(__GNUC__) && !defined(__clang__) | |
131 | _INTRINSATTR | 133 | _INTRINSATTR | |
132 | static __inline uint32x4_t | 134 | static __inline uint32x4_t | |
133 | vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) | 135 | vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) | |
134 | { | 136 | { | |
135 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | 137 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | |
136 | return __builtin_shuffle(__hi, __lo, | 138 | return __builtin_shuffle(__hi, __lo, | |
137 | (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); | 139 | (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); | |
138 | #else | 140 | #else | |
139 | return __builtin_shuffle(__lo, __hi, | 141 | return __builtin_shuffle(__lo, __hi, | |
140 | (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); | 142 | (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); | |
141 | #endif | 143 | #endif | |
142 | } | 144 | } | |
143 | #elif defined(__clang__) | 145 | #elif defined(__clang__) | |
144 | #ifdef __LITTLE_ENDIAN__ | 146 | #ifdef __LITTLE_ENDIAN__ | |
145 | #define vextq_u32(__lo, __hi, __i) \ | 147 | #define vextq_u32(__lo, __hi, __i) \ | |
146 | (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | 148 | (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | |
147 | (int8x16_t)(__hi), (__i), 50) | 149 | (int8x16_t)(__hi), (__i), 50) | |
148 | #else | 150 | #else | |
149 | #define vextq_u32(__lo, __hi, __i) ( \ | 151 | #define vextq_u32(__lo, __hi, __i) ( \ | |
150 | { \ | 152 | { \ | |
151 | uint32x4_t __tlo = (__lo); \ | 153 | uint32x4_t __tlo = (__lo); \ | |
152 | uint32x4_t __thi = (__hi); \ | 154 | uint32x4_t __thi = (__hi); \ | |
153 | uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ | 155 | uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ | |
154 | uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ | 156 | uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ | |
155 | uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | 157 | uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | |
156 | (int8x16_t)__hi_r, __i, 50); \ | 158 | (int8x16_t)__hi_r, __i, 50); \ | |
157 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | 159 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | |
158 | }) | 160 | }) | |
159 | #endif /* __LITTLE_ENDIAN__ */ | 161 | #endif /* __LITTLE_ENDIAN__ */ | |
160 | #endif | 162 | #endif | |
161 | 163 | |||
162 | #if defined(__GNUC__) && !defined(__clang__) | 164 | #if defined(__GNUC__) && !defined(__clang__) | |
163 | _INTRINSATTR | 165 | _INTRINSATTR | |
164 | static __inline uint8x16_t | 166 | static __inline uint8x16_t | |
165 | vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) | 167 | vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) | |
166 | { | 168 | { | |
167 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | 169 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | |
168 | return __builtin_shuffle(__hi, __lo, | 170 | return __builtin_shuffle(__hi, __lo, | |
169 | (uint8x16_t) { | 171 | (uint8x16_t) { | |
170 | 16 - __i, 17 - __i, 18 - __i, 19 - __i, | 172 | 16 - __i, 17 - __i, 18 - __i, 19 - __i, | |
171 | 20 - __i, 21 - __i, 22 - __i, 23 - __i, | 173 | 20 - __i, 21 - __i, 22 - __i, 23 - __i, | |
172 | 24 - __i, 25 - __i, 26 - __i, 27 - __i, | 174 | 24 - __i, 25 - __i, 26 - __i, 27 - __i, | |
173 | 28 - __i, 29 - __i, 30 - __i, 31 - __i, | 175 | 28 - __i, 29 - __i, 30 - __i, 31 - __i, | |
174 | }); | 176 | }); | |
175 | #else | 177 | #else | |
176 | return __builtin_shuffle(__lo, __hi, | 178 | return __builtin_shuffle(__lo, __hi, | |
177 | (uint8x16_t) { | 179 | (uint8x16_t) { | |
178 | __i + 0, __i + 1, __i + 2, __i + 3, | 180 | __i + 0, __i + 1, __i + 2, __i + 3, | |
179 | __i + 4, __i + 5, __i + 6, __i + 7, | 181 | __i + 4, __i + 5, __i + 6, __i + 7, | |
180 | __i + 8, __i + 9, __i + 10, __i + 11, | 182 | __i + 8, __i + 9, __i + 10, __i + 11, | |
181 | __i + 12, __i + 13, __i + 14, __i + 15, | 183 | __i + 12, __i + 13, __i + 14, __i + 15, | |
182 | }); | 184 | }); | |
183 | #endif | 185 | #endif | |
184 | } | 186 | } | |
185 | #elif defined(__clang__) | 187 | #elif defined(__clang__) | |
186 | #ifdef __LITTLE_ENDIAN__ | 188 | #ifdef __LITTLE_ENDIAN__ | |
187 | #define vextq_u8(__lo, __hi, __i) \ | 189 | #define vextq_u8(__lo, __hi, __i) \ | |
188 | (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | 190 | (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ | |
189 | (int8x16_t)(__hi), (__i), 48) | 191 | (int8x16_t)(__hi), (__i), 48) | |
190 | #else | 192 | #else | |
191 | #define vextq_u8(__lo, __hi, __i) ( \ | 193 | #define vextq_u8(__lo, __hi, __i) ( \ | |
192 | { \ | 194 | { \ | |
193 | uint8x16_t __tlo = (__lo); \ | 195 | uint8x16_t __tlo = (__lo); \ | |
194 | uint8x16_t __thi = (__hi); \ | 196 | uint8x16_t __thi = (__hi); \ | |
195 | uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ | 197 | uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ | |
196 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | 198 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | |
197 | uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ | 199 | uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ | |
198 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | 200 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | |
199 | uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | 201 | uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ | |
200 | (int8x16_t)__hi_r, (__i), 48); \ | 202 | (int8x16_t)__hi_r, (__i), 48); \ | |
201 | 	__builtin_shufflevector(__r, __r, \ | 203 | 	__builtin_shufflevector(__r, __r, \ | 
202 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | 204 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ | |
203 | }) | 205 | }) | |
204 | #endif /* __LITTLE_ENDIAN */ | 206 | #endif /* __LITTLE_ENDIAN */ | |
205 | #endif | 207 | #endif | |
206 | 208 | |||
207 | #if defined(__GNUC__) && !defined(__clang__) | 209 | #if defined(__GNUC__) && !defined(__clang__) | |
208 | _INTRINSATTR | 210 | _INTRINSATTR | |
209 | static __inline uint32_t | 211 | static __inline uint32_t | |
210 | vgetq_lane_u32(uint32x4_t __v, uint8_t __i) | 212 | vgetq_lane_u32(uint32x4_t __v, uint8_t __i) | |
211 | { | 213 | { | |
212 | #ifdef __aarch64__ | 214 | #ifdef __aarch64__ | |
213 | return __v[__i]; | 215 | return __v[__i]; | |
214 | #else | 216 | #else | |
215 | return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); | 217 | return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); | |
216 | #endif | 218 | #endif | |
217 | } | 219 | } | |
218 | #elif defined(__clang__) | 220 | #elif defined(__clang__) | |
219 | #define vgetq_lane_u32(__v, __i) \ | 221 | #define vgetq_lane_u32(__v, __i) \ | |
220 | (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ | 222 | (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ | |
221 | __neon_lane_index(__v, __i)) | 223 | __neon_lane_index(__v, __i)) | |
222 | #endif | 224 | #endif | |
223 | 225 | |||
224 | _INTRINSATTR | 226 | _INTRINSATTR | |
225 | static __inline uint32x4_t | 227 | static __inline uint32x4_t | |
226 | vld1q_u32(const uint32_t *__p32) | 228 | vld1q_u32(const uint32_t *__p32) | |
227 | { | 229 | { | |
228 | #if defined(__GNUC__) && !defined(__clang__) | 230 | #if defined(__GNUC__) && !defined(__clang__) | |
229 | #ifdef __aarch64__ | 231 | #ifdef __aarch64__ | |
230 | const __builtin_aarch64_simd_si *__p = | 232 | const __builtin_aarch64_simd_si *__p = | |
231 | (const __builtin_aarch64_simd_si *)__p32; | 233 | (const __builtin_aarch64_simd_si *)__p32; | |
232 | 234 | |||
233 | return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); | 235 | return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); | |
234 | #else | 236 | #else | |
235 | const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; | 237 | const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; | |
236 | 238 | |||
237 | return (uint32x4_t)__builtin_neon_vld1v4si(__p); | 239 | return (uint32x4_t)__builtin_neon_vld1v4si(__p); | |
238 | #endif | 240 | #endif | |
239 | #elif defined(__clang__) | 241 | #elif defined(__clang__) | |
240 | uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); | 242 | uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); | |
241 | #ifndef __LITTLE_ENDIAN__ | 243 | #ifndef __LITTLE_ENDIAN__ | |
242 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | 244 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | |
243 | #endif | 245 | #endif | |
244 | return __v; | 246 | return __v; | |
245 | #endif | 247 | #endif | |
246 | } | 248 | } | |
247 | 249 | |||
248 | _INTRINSATTR | 250 | _INTRINSATTR | |
249 | static __inline uint8x16_t | 251 | static __inline uint8x16_t | |
250 | vld1q_u8(const uint8_t *__p8) | 252 | vld1q_u8(const uint8_t *__p8) | |
251 | { | 253 | { | |
252 | #if defined(__GNUC__) && !defined(__clang__) | 254 | #if defined(__GNUC__) && !defined(__clang__) | |
253 | #ifdef __aarch64__ | 255 | #ifdef __aarch64__ | |
254 | const __builtin_aarch64_simd_qi *__p = | 256 | const __builtin_aarch64_simd_qi *__p = | |
255 | (const __builtin_aarch64_simd_qi *)__p8; | 257 | (const __builtin_aarch64_simd_qi *)__p8; | |
256 | 258 | |||
257 | return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); | 259 | return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); | |
258 | #else | 260 | #else | |
259 | const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; | 261 | const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; | |
260 | 262 | |||
261 | return (uint8x16_t)__builtin_neon_vld1v16qi(__p); | 263 | return (uint8x16_t)__builtin_neon_vld1v16qi(__p); | |
262 | #endif | 264 | #endif | |
263 | #elif defined(__clang__) | 265 | #elif defined(__clang__) | |
264 | uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); | 266 | uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); | |
265 | #ifndef __LITTLE_ENDIAN__ | 267 | #ifndef __LITTLE_ENDIAN__ | |
266 | __v = __builtin_shufflevector(__v, __v, | 268 | __v = __builtin_shufflevector(__v, __v, | |
267 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 269 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | |
268 | #endif | 270 | #endif | |
269 | return __v; | 271 | return __v; | |
270 | #endif | 272 | #endif | |
271 | } | 273 | } | |
272 | 274 | |||
273 | _INTRINSATTR | 275 | _INTRINSATTR | |
274 | static __inline uint8x16_t | 276 | static __inline uint8x16_t | |
275 | vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) | 277 | vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) | |
276 | { | 278 | { | |
277 | #if defined(__GNUC__) && !defined(__clang__) | 279 | #if defined(__GNUC__) && !defined(__clang__) | |
278 | #ifdef __aarch64__ | 280 | #ifdef __aarch64__ | |
279 | uint8x16_t __res; | 281 | uint8x16_t __res; | |
280 | __asm__("tbl %0.16b, {%1.16b}, %2.16b" | 282 | __asm__("tbl %0.16b, {%1.16b}, %2.16b" | |
281 | : "=w"(__res) : "w"(__tab), "w"(__idx)); | 283 | : "=w"(__res) : "w"(__tab), "w"(__idx)); | |
282 | return __res; | 284 | return __res; | |
283 | #else | 285 | #else | |
284 | /* | 286 | /* | |
285 | * No native ARMv7 NEON instruction for this, so do it via two | 287 | * No native ARMv7 NEON instruction for this, so do it via two | |
286 | * half-width TBLs instead (vtbl2_u8 equivalent). | 288 | * half-width TBLs instead (vtbl2_u8 equivalent). | |
287 | */ | 289 | */ | |
288 | uint64x2_t __tab64 = (uint64x2_t)__tab; | 290 | uint64x2_t __tab64 = (uint64x2_t)__tab; | |
289 | uint8x8_t __tablo = (uint8x8_t)__tab64[0]; | 291 | uint8x8_t __tablo = (uint8x8_t)__tab64[0]; | |
290 | uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; | 292 | uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; | |
291 | uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; | 293 | uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; | |
292 | union { | 294 | union { | |
293 | uint8x8x2_t __u8x8x2; | 295 | uint8x8x2_t __u8x8x2; | |
294 | __builtin_neon_ti __ti; | 296 | __builtin_neon_ti __ti; | |
295 | } __u = { __tab8x8x2 }; | 297 | } __u = { __tab8x8x2 }; | |
296 | uint64x2_t __idx64, __out64; | 298 | uint64x2_t __idx64, __out64; | |
297 | int8x8_t __idxlo, __idxhi, __outlo, __outhi; | 299 | int8x8_t __idxlo, __idxhi, __outlo, __outhi; | |
298 | 300 | |||
299 | __idx64 = (uint64x2_t)__idx; | 301 | __idx64 = (uint64x2_t)__idx; | |
300 | __idxlo = (int8x8_t)__idx64[0]; | 302 | __idxlo = (int8x8_t)__idx64[0]; | |
301 | __idxhi = (int8x8_t)__idx64[1]; | 303 | __idxhi = (int8x8_t)__idx64[1]; | |
302 | __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); | 304 | __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); | |
303 | __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); | 305 | __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); | |
304 | __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; | 306 | __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; | |
305 | 307 | |||
306 | return (uint8x16_t)__out64; | 308 | return (uint8x16_t)__out64; | |
307 | #endif | 309 | #endif | |
308 | #elif defined(__clang__) | 310 | #elif defined(__clang__) | |
309 | #ifdef __LITTLE_ENDIAN__ | 311 | #ifdef __LITTLE_ENDIAN__ | |
310 | return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, | 312 | return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, | |
311 | (int8x16_t)__idx, 48); | 313 | (int8x16_t)__idx, 48); | |
312 | #else | 314 | #else | |
313 | 	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab, | 315 | 	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab, | 
314 | 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 316 | 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 
315 | 	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx, | 317 | 	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx, | 
316 | 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 318 | 	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 
317 | 	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v( | 319 | 	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v( | 
318 | 	    (int8x16_t)__tab_r, (int8x16_t)__idx_r, 48); | 320 | 	    (int8x16_t)__tab_r, (int8x16_t)__idx_r, 48); | 
319 | return __builtin_shufflevector(__r, __r, | 321 | return __builtin_shufflevector(__r, __r, | |
320 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 322 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | |
321 | #endif | 323 | #endif | |
322 | #endif | 324 | #endif | |
323 | } | 325 | } | |
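
vqtbl1q_u8 is the workhorse of vpaes: one TBL performs sixteen table
lookups in parallel, replacing the secret-dependent memory indexing of an
ordinary S-box with constant-time register shuffles. A scalar model of its
semantics; note the zeroing of out-of-range indices, which both the
AArch64 TBL and the ARMv7 vtbl2 fallback above guarantee:

#include <stdint.h>

/*
 * Reference semantics of vqtbl1q_u8 (TBL with a one-register
 * table): out[i] = tab[idx[i]] when idx[i] < 16, else 0.
 */
static void
tbl_ref(uint8_t out[16], const uint8_t tab[16], const uint8_t idx[16])
{
	int i;

	for (i = 0; i < 16; i++)
		out[i] = idx[i] < 16 ? tab[idx[i]] : 0;
}
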
324 | 326 | |||
325 | _INTRINSATTR | 327 | _INTRINSATTR | |
326 | static __inline int32x4_t | 328 | static __inline int32x4_t | |
327 | vreinterpretq_s32_u8(uint8x16_t __v) | 329 | vreinterpretq_s32_u8(uint8x16_t __v) | |
328 | { | 330 | { | |
329 | return (int32x4_t)__v; | 331 | return (int32x4_t)__v; | |
330 | } | 332 | } | |
331 | 333 | |||
332 | _INTRINSATTR | 334 | _INTRINSATTR | |
333 | static __inline uint32x4_t | 335 | static __inline uint32x4_t | |
334 | vreinterpretq_u32_u8(uint8x16_t __v) | 336 | vreinterpretq_u32_u8(uint8x16_t __v) | |
335 | { | 337 | { | |
336 | return (uint32x4_t)__v; | 338 | return (uint32x4_t)__v; | |
337 | } | 339 | } | |
338 | 340 | |||
339 | _INTRINSATTR | 341 | _INTRINSATTR | |
340 | static __inline uint64x2_t | 342 | static __inline uint64x2_t | |
341 | vreinterpretq_u64_u8(uint8x16_t __v) | 343 | vreinterpretq_u64_u8(uint8x16_t __v) | |
342 | { | 344 | { | |
343 | return (uint64x2_t)__v; | 345 | return (uint64x2_t)__v; | |
344 | } | 346 | } | |
345 | 347 | |||
346 | _INTRINSATTR | 348 | _INTRINSATTR | |
347 | static __inline uint8x16_t | 349 | static __inline uint8x16_t | |
348 | vreinterpretq_u8_s32(int32x4_t __v) | 350 | vreinterpretq_u8_s32(int32x4_t __v) | |
349 | { | 351 | { | |
350 | return (uint8x16_t)__v; | 352 | return (uint8x16_t)__v; | |
351 | } | 353 | } | |
352 | 354 | |||
353 | _INTRINSATTR | 355 | _INTRINSATTR | |
354 | static __inline uint8x16_t | 356 | static __inline uint8x16_t | |
355 | vreinterpretq_u8_u32(uint32x4_t __v) | 357 | vreinterpretq_u8_u32(uint32x4_t __v) | |
356 | { | 358 | { | |
357 | return (uint8x16_t)__v; | 359 | return (uint8x16_t)__v; | |
358 | } | 360 | } | |
359 | 361 | |||
360 | _INTRINSATTR | 362 | _INTRINSATTR | |
361 | static __inline uint8x16_t | 363 | static __inline uint8x16_t | |
362 | vreinterpretq_u8_u64(uint64x2_t __v) | 364 | vreinterpretq_u8_u64(uint64x2_t __v) | |
363 | { | 365 | { | |
364 | return (uint8x16_t)__v; | 366 | return (uint8x16_t)__v; | |
365 | } | 367 | } | |
366 | 368 | |||
367 | _INTRINSATTR | 369 | _INTRINSATTR | |
368 | static __inline uint8x16_t | 370 | static __inline uint8x16_t | |
369 | vrev32q_u8(uint8x16_t __v) | 371 | vrev32q_u8(uint8x16_t __v) | |
370 | { | 372 | { | |
371 | #if defined(__GNUC__) && !defined(__clang__) | 373 | #if defined(__GNUC__) && !defined(__clang__) | |
372 | return __builtin_shuffle(__v, | 374 | return __builtin_shuffle(__v, | |
373 | (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); | 375 | (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); | |
374 | #elif defined(__clang__) | 376 | #elif defined(__clang__) | |
375 | return __builtin_shufflevector(__v, | 377 | return __builtin_shufflevector(__v, | |
376 | 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); | 378 | 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); | |
377 | #endif | 379 | #endif | |
378 | } | 380 | } | |
379 | 381 | |||
380 | #if defined(__GNUC__) && !defined(__clang__) | 382 | #if defined(__GNUC__) && !defined(__clang__) | |
381 | _INTRINSATTR | 383 | _INTRINSATTR | |
382 | static __inline uint32x4_t | 384 | static __inline uint32x4_t | |
383 | vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) | 385 | vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) | |
384 | { | 386 | { | |
385 | __v[__neon_lane_index(__v, __i)] = __x; | 387 | __v[__neon_lane_index(__v, __i)] = __x; | |
386 | return __v; | 388 | return __v; | |
387 | } | 389 | } | |
388 | #elif defined(__clang__) | 390 | #elif defined(__clang__) | |
389 | #define vsetq_lane_u32(__x, __v, __i) \ | 391 | #define vsetq_lane_u32(__x, __v, __i) \ | |
390 | (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ | 392 | (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ | |
391 | __neon_lane_index(__v, __i)) | 393 | __neon_lane_index(__v, __i)) | |
392 | #endif | 394 | #endif | |
393 | 395 | |||
394 | #if defined(__GNUC__) && !defined(__clang__) | 396 | #if defined(__GNUC__) && !defined(__clang__) | |
395 | _INTRINSATTR | 397 | _INTRINSATTR | |
396 | static __inline uint64x2_t | 398 | static __inline uint64x2_t | |
397 | vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) | 399 | vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) | |
398 | { | 400 | { | |
399 | __v[__neon_lane_index(__v, __i)] = __x; | 401 | __v[__neon_lane_index(__v, __i)] = __x; | |
400 | return __v; | 402 | return __v; | |
401 | } | 403 | } | |
402 | #elif defined(__clang__) | 404 | #elif defined(__clang__) | |
403 | #define vsetq_lane_u64(__x, __v, __i) \ | 405 | #define vsetq_lane_u64(__x, __v, __i) \ | |
404 | 	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ | 406 | 	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \ | 
405 | 	    __neon_lane_index(__v, __i)) | 407 | 	    __neon_lane_index(__v, __i)) | 
406 | #endif | 408 | #endif | |
407 | 409 | |||
408 | #if defined(__GNUC__) && !defined(__clang__) | 410 | #if defined(__GNUC__) && !defined(__clang__) | |
409 | _INTRINSATTR | 411 | _INTRINSATTR | |
410 | static __inline uint32x4_t | 412 | static __inline uint32x4_t | |
411 | vshlq_n_u32(uint32x4_t __v, uint8_t __bits) | 413 | vshlq_n_u32(uint32x4_t __v, uint8_t __bits) | |
412 | { | 414 | { | |
413 | #ifdef __aarch64__ | 415 | #ifdef __aarch64__ | |
414 | return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); | 416 | return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); | |
415 | #else | 417 | #else | |
416 | return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); | 418 | return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); | |
417 | #endif | 419 | #endif | |
418 | } | 420 | } | |
419 | #elif defined(__clang__) | 421 | #elif defined(__clang__) | |
420 | #define vshlq_n_u32(__v, __bits) \ | 422 | #define vshlq_n_u32(__v, __bits) \ | |
421 | (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) | 423 | (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) | |
422 | #endif | 424 | #endif | |
423 | 425 | |||
424 | #if defined(__GNUC__) && !defined(__clang__) | 426 | #if defined(__GNUC__) && !defined(__clang__) | |
425 | _INTRINSATTR | 427 | _INTRINSATTR | |
426 | static __inline uint32x4_t | 428 | static __inline uint32x4_t | |
427 | vshrq_n_u32(uint32x4_t __v, uint8_t __bits) | 429 | vshrq_n_u32(uint32x4_t __v, uint8_t __bits) | |
428 | { | 430 | { | |
429 | #ifdef __aarch64__ | 431 | #ifdef __aarch64__ | |
430 | return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); | 432 | return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); | |
431 | #else | 433 | #else | |
432 | return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); | 434 | return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); | |
433 | #endif | 435 | #endif | |
434 | } | 436 | } | |
435 | #elif defined(__clang__) | 437 | #elif defined(__clang__) | |
436 | #define vshrq_n_u32(__v, __bits) \ | 438 | #define vshrq_n_u32(__v, __bits) \ | 
437 | (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) | 439 | (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) | |
438 | #endif | 440 | #endif | |
439 | 441 | |||
440 | #if defined(__GNUC__) && !defined(__clang__) | 442 | #if defined(__GNUC__) && !defined(__clang__) | |
441 | _INTRINSATTR | 443 | _INTRINSATTR | |
442 | static __inline uint8x16_t | 444 | static __inline uint8x16_t | |
443 | vshrq_n_u8(uint8x16_t __v, uint8_t __bits) | 445 | vshrq_n_u8(uint8x16_t __v, uint8_t __bits) | |
444 | { | 446 | { | |
445 | #ifdef __aarch64__ | 447 | #ifdef __aarch64__ | |
446 | return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); | 448 | return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); | |
447 | #else | 449 | #else | |
448 | return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); | 450 | return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); | |
449 | #endif | 451 | #endif | |
450 | } | 452 | } | |
451 | #elif defined(__clang__) | 453 | #elif defined(__clang__) | |
452 | #define vshrq_n_u8(__v, __bits) \ | 454 | #define vshrq_n_u8(__v, __bits) \ | |
453 | (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) | 455 | (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) | |
454 | #endif | 456 | #endif | |
455 | 457 | |||
456 | #if defined(__GNUC__) && !defined(__clang__) | 458 | #if defined(__GNUC__) && !defined(__clang__) | |
457 | _INTRINSATTR | 459 | _INTRINSATTR | |
458 | static __inline int32x4_t | 460 | static __inline int32x4_t | |
459 | vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) | 461 | vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) | |
460 | { | 462 | { | |
461 | #ifdef __aarch64__ | 463 | #ifdef __aarch64__ | |
462 | return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); | 464 | return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); | |
463 | #else | 465 | #else | |
464 | return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); | 466 | return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); | |
465 | #endif | 467 | #endif | |
466 | } | 468 | } | |
467 | #elif defined(__clang__) | 469 | #elif defined(__clang__) | |
468 | #ifdef __LITTLE_ENDIAN__ | 470 | #ifdef __LITTLE_ENDIAN__ | |
469 | #define vsliq_n_s32(__vins, __vsh, __bits) \ | 471 | #define vsliq_n_s32(__vins, __vsh, __bits) \ | |
470 | (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ | 472 | (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ | |
471 | (int32x4_t)(__vsh), (__bits), 34) | 473 | (int32x4_t)(__vsh), (__bits), 34) | |
472 | #else | 474 | #else | |
473 | #define vsliq_n_s32(__vins, __vsh, __bits) ( \ | 475 | #define vsliq_n_s32(__vins, __vsh, __bits) ( \ | |
474 | { \ | 476 | { \ | |
475 | int32x4_t __tvins = (__vins); \ | 477 | int32x4_t __tvins = (__vins); \ | |
476 | int32x4_t __tvsh = (__vsh); \ | 478 | int32x4_t __tvsh = (__vsh); \ | |
477 | uint8_t __tbits = (__bits); \ | 479 | uint8_t __tbits = (__bits); \ | |
478 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ | 480 | int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ | |
479 | 3,2,1,0); \ | 481 | 3,2,1,0); \ | |
480 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ | 482 | int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ | |
481 | 3,2,1,0); \ | 483 | 3,2,1,0); \ | |
482 | 	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \ | 484 | 	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \ | 
483 | 34); \ | 485 | 34); \ | |
484 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | 486 | __builtin_shufflevector(__r, __r, 3,2,1,0); \ | |
485 | }) | 487 | }) | |
486 | #endif /* __LITTLE_ENDIAN__ */ | 488 | #endif /* __LITTLE_ENDIAN__ */ | |
487 | #endif | 489 | #endif | |
488 | 490 | |||
489 | _INTRINSATTR | 491 | _INTRINSATTR | |
490 | static __inline void | 492 | static __inline void | |
491 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) | 493 | vst1q_u32(uint32_t *__p32, uint32x4_t __v) | |
492 | { | 494 | { | |
493 | #if defined(__GNUC__) && !defined(__clang__) | 495 | #if defined(__GNUC__) && !defined(__clang__) | |
494 | #ifdef __aarch64__ | 496 | #ifdef __aarch64__ | |
495 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; | 497 | __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; | |
496 | 498 | |||
497 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); | 499 | __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); | |
498 | #else | 500 | #else | |
499 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; | 501 | __builtin_neon_si *__p = (__builtin_neon_si *)__p32; | |
500 | 502 | |||
501 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); | 503 | __builtin_neon_vst1v4si(__p, (int32x4_t)__v); | |
502 | #endif | 504 | #endif | |
503 | #elif defined(__clang__) | 505 | #elif defined(__clang__) | |
504 | #ifndef __LITTLE_ENDIAN__ | 506 | #ifndef __LITTLE_ENDIAN__ | |
505 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | 507 | __v = __builtin_shufflevector(__v, __v, 3,2,1,0); | |
506 | #endif | 508 | #endif | |
507 | __builtin_neon_vst1q_v(__p32, __v, 50); | 509 | __builtin_neon_vst1q_v(__p32, __v, 50); | |
508 | #endif | 510 | #endif | |
509 | } | 511 | } | |
510 | 512 | |||
511 | _INTRINSATTR | 513 | _INTRINSATTR | |
512 | static __inline void | 514 | static __inline void | |
513 | vst1q_u8(uint8_t *__p8, uint8x16_t __v) | 515 | vst1q_u8(uint8_t *__p8, uint8x16_t __v) | |
514 | { | 516 | { | |
515 | #if defined(__GNUC__) && !defined(__clang__) | 517 | #if defined(__GNUC__) && !defined(__clang__) | |
516 | #ifdef __aarch64__ | 518 | #ifdef __aarch64__ | |
517 | __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; | 519 | __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; | |
518 | 520 | |||
519 | __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); | 521 | __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); | |
520 | #else | 522 | #else | |
521 | __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; | 523 | __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; | |
522 | 524 | |||
523 | __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); | 525 | __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); | |
524 | #endif | 526 | #endif | |
525 | #elif defined(__clang__) | 527 | #elif defined(__clang__) | |
526 | #ifndef __LITTLE_ENDIAN__ | 528 | #ifndef __LITTLE_ENDIAN__ | |
527 | __v = __builtin_shufflevector(__v, __v, | 529 | __v = __builtin_shufflevector(__v, __v, | |
528 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | 530 | 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); | |
529 | #endif | 531 | #endif | |
530 | __builtin_neon_vst1q_v(__p8, __v, 48); | 532 | __builtin_neon_vst1q_v(__p8, __v, 48); | |
531 | #endif | 533 | #endif | |
532 | } | 534 | } | |
533 | 535 | |||
534 | #endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ | 536 | #endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ |
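
Finally, a small userland smoke test for the polyfills, exercising the
load/rev32/store path that aes_neon.c uses to convert the big-endian CCM
counter (a sketch; assumes the header builds standalone against either
compiler):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "arm_neon.h"

/*
 * vrev32q_u8 must byte-swap each 32-bit lane; check it through a
 * full vld1q_u8/vrev32q_u8/vst1q_u8 round trip.
 */
int
main(void)
{
	static const uint8_t in[16] = {
		0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
	};
	static const uint8_t expect[16] = {
		3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
	};
	uint8_t out[16];

	vst1q_u8(out, vrev32q_u8(vld1q_u8(in)));
	if (memcmp(out, expect, 16) != 0) {
		printf("vrev32q_u8 mismatch\n");
		return 1;
	}
	return 0;
}
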