Tue Jul 28 20:11:09 2020 UTC
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4 CPU, which lacks
ARMv8.0-AES.


(riastradh)
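
The change pairs two independent vpaes computations per pass: the new
aes_neon_enc2 and aes_neon_dec2 take a uint8x16x2_t of blocks through
the round loop together, and the CBC-decrypt, XTS, and CCM paths feed
them two blocks per iteration (or the CBC-MAC block plus the counter
block in CCM).  As a minimal caller-side sketch -- a hypothetical
helper for illustration, not code from this commit, assuming only the
aes_neon_dec2 prototype added via aes_neon_impl.h and the file's
existing use of the GCC vector ^ operator on uint8x16_t --
two-blocks-at-a-time CBC decryption looks roughly like this (the
committed aes_neon_cbc_dec below also peels an odd trailing block and
walks the buffer from the end):

#include <crypto/aes/arch/arm/aes_neon.h>

#include "aes_neon_impl.h"

/*
 * Hypothetical illustration only: decrypt nbytes of CBC ciphertext,
 * nbytes a positive multiple of 32, two blocks per aes_neon_dec2 call.
 */
static void
cbc_dec_2x_sketch(const struct aesdec *dec, const uint8_t *in,
    uint8_t *out, size_t nbytes, uint8_t iv[static 16], uint32_t nrounds)
{
	uint8x16_t cv = vld1q_u8(iv);	/* chaining value, starts at IV */

	for (; nbytes >= 32; nbytes -= 32, in += 32, out += 32) {
		uint8x16x2_t b2;

		b2.val[0] = vld1q_u8(in);	/* C[i] */
		b2.val[1] = vld1q_u8(in + 16);	/* C[i+1] */
		b2 = aes_neon_dec2(dec, b2, nrounds); /* 2 blocks in flight */
		vst1q_u8(out, b2.val[0] ^ cv);			/* P[i] */
		vst1q_u8(out + 16, b2.val[1] ^ vld1q_u8(in));	/* P[i+1] */
		cv = vld1q_u8(in + 16);		/* next chaining value */
	}
	vst1q_u8(iv, cv);		/* return updated IV to caller */
}
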
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c

--- src/sys/crypto/aes/arch/arm/aes_neon.c 2020/06/30 20:32:11 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c 2020/07/28 20:11:09 1.4
@@ -1,631 +1,740 @@ @@ -1,631 +1,740 @@
1/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ 1/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES 30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
31 * software, at <https://crypto.stanford.edu/vpaes/>, described in 31 * software, at <https://crypto.stanford.edu/vpaes/>, described in
32 * 32 *
33 * Mike Hamburg, `Accelerating AES with Vector Permute 33 * Mike Hamburg, `Accelerating AES with Vector Permute
34 * Instructions', in Christophe Clavier and Kris Gaj (eds.), 34 * Instructions', in Christophe Clavier and Kris Gaj (eds.),
35 * Cryptographic Hardware and Embedded Systems -- CHES 2009, 35 * Cryptographic Hardware and Embedded Systems -- CHES 2009,
36 * Springer LNCS 5747, pp. 18-32. 36 * Springer LNCS 5747, pp. 18-32.
37 * 37 *
38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39 */ 39 */
40 40
41#include <sys/cdefs.h> 41#include <sys/cdefs.h>
42__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); 42__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
43 43
44#include <sys/types.h> 44#include <sys/types.h>
45 45
46#ifdef _KERNEL 46#ifdef _KERNEL
47#include <sys/systm.h> 47#include <sys/systm.h>
48#else 48#else
49#include <err.h> 49#include <err.h>
50#define panic(fmt, args...) err(1, fmt, ##args) 50#define panic(fmt, args...) err(1, fmt, ##args)
51#endif 51#endif
52 52
53#include "aes_neon_impl.h" 53#include "aes_neon_impl.h"
54 54
55#ifdef __aarch64__ 55#ifdef __aarch64__
56#define __aarch64_used 56#define __aarch64_used
57#else 57#else
58#define __aarch64_used __unused 58#define __aarch64_used __unused
59#endif 59#endif
60 60
61static const uint8x16_t 61static const uint8x16_t
62mc_forward[4] = { 62mc_forward[4] = {
63 {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, 63 {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
64 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C}, 64 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
65 {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, 65 {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
66 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00}, 66 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
67 {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, 67 {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
68 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04}, 68 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
69 {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, 69 {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
70 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, 70 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
71}, 71},
72mc_backward[4] __aarch64_used = { 72mc_backward[4] __aarch64_used = {
73 {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, 73 {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
74 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, 74 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
75 {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, 75 {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
76 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A}, 76 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
77 {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, 77 {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
78 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06}, 78 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
79 {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, 79 {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
80 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, 80 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
81}, 81},
82ipt[2] __aarch64_used = { 82ipt[2] __aarch64_used = {
83 {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, 83 {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
84 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, 84 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
85 {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, 85 {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
86 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD}, 86 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
87}, 87},
88opt[2] = { 88opt[2] = {
89 {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, 89 {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
90 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7}, 90 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
91 {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, 91 {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
92 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, 92 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
93}, 93},
94dipt[2] __aarch64_used = { 94dipt[2] __aarch64_used = {
95 {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, 95 {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
96 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, 96 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
97 {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, 97 {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
98 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, 98 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
99}, 99},
100sb1[2] __aarch64_used = { 100sb1[2] __aarch64_used = {
101 {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, 101 {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
102 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, 102 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
103 {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, 103 {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
104 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, 104 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
105}, 105},
106sb2[2] __aarch64_used = { 106sb2[2] __aarch64_used = {
107 {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, 107 {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
108 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, 108 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
109 {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, 109 {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
110 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, 110 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
111}, 111},
112sbo[2] __aarch64_used = { 112sbo[2] __aarch64_used = {
113 {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, 113 {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
114 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, 114 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
115 {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, 115 {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
116 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, 116 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
117}, 117},
118dsb9[2] __aarch64_used = { 118dsb9[2] __aarch64_used = {
119 {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, 119 {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
120 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, 120 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
121 {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, 121 {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
122 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, 122 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
123}, 123},
124dsbd[2] __aarch64_used = { 124dsbd[2] __aarch64_used = {
125 {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, 125 {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
126 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, 126 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
127 {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, 127 {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
128 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, 128 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
129}, 129},
130dsbb[2] __aarch64_used = { 130dsbb[2] __aarch64_used = {
131 {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, 131 {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
132 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, 132 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
133 {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, 133 {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
134 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, 134 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
135}, 135},
136dsbe[2] __aarch64_used = { 136dsbe[2] __aarch64_used = {
137 {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, 137 {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
138 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, 138 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
139 {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, 139 {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
140 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, 140 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
141}, 141},
142dsbo[2] __aarch64_used = { 142dsbo[2] __aarch64_used = {
143 {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, 143 {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
144 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, 144 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
145 {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, 145 {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
146 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA}, 146 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
147}, 147},
148dks1[2] = { 148dks1[2] = {
149 {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, 149 {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
150 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A}, 150 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
151 {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, 151 {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
152 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B}, 152 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
153}, 153},
154dks2[2] = { 154dks2[2] = {
155 {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, 155 {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
156 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46}, 156 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
157 {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, 157 {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
158 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73}, 158 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
159}, 159},
160dks3[2] = { 160dks3[2] = {
161 {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, 161 {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
162 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8}, 162 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
163 {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, 163 {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
164 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5}, 164 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
165}, 165},
166dks4[2] = { 166dks4[2] = {
167 {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, 167 {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
168 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0}, 168 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
169 {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, 169 {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
170 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F}, 170 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
171}, 171},
172deskew[2] = { 172deskew[2] = {
173 {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, 173 {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
174 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D}, 174 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
175 {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, 175 {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
176 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, 176 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
177}, 177},
178sr[4] __aarch64_used = { 178sr[4] __aarch64_used = {
179 {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 179 {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
180 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, 180 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
181 {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, 181 {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
182 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B}, 182 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
183 {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, 183 {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
184 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07}, 184 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
185 {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, 185 {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
186 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03}, 186 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
187}, 187},
188rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, 188rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
189 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70}, 189 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
190s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, 190s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
191 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B}, 191 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
192of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, 192of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
193 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F}, 193 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
194inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, 194inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
195 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04}, 195 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
196inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, 196inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
197 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03}; 197 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};
198 198
199static inline uint8x16_t 199static inline uint8x16_t
200loadroundkey(const void *rkp) 200loadroundkey(const void *rkp)
201{ 201{
202 return vld1q_u8(rkp); 202 return vld1q_u8(rkp);
203} 203}
204 204
205static inline void 205static inline void
206storeroundkey(void *rkp, uint8x16_t rk) 206storeroundkey(void *rkp, uint8x16_t rk)
207{ 207{
208 vst1q_u8(rkp, rk); 208 vst1q_u8(rkp, rk);
209} 209}
210 210
211/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */ 211/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
212static inline void 212static inline void
213bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x) 213bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
214{ 214{
215 215
216 *lo = of & x; 216 *lo = of & x;
217 *hi = of & vshrq_n_u8(x, 4); 217 *hi = of & vshrq_n_u8(x, 4);
218} 218}
219 219
220/* 220/*
221 * t is a pair of maps respectively from low and high nybbles to bytes. 221 * t is a pair of maps respectively from low and high nybbles to bytes.
222 * Apply t the nybbles, and add the results in GF(2). 222 * Apply t the nybbles, and add the results in GF(2).
223 */ 223 */
224static uint8x16_t 224static uint8x16_t
225aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2]) 225aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
226{ 226{
227 uint8x16_t lo, hi; 227 uint8x16_t lo, hi;
228 228
229 bytes2nybbles(&lo, &hi, x); 229 bytes2nybbles(&lo, &hi, x);
230 return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi); 230 return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
231} 231}
232 232
233static inline void 233static inline void
234subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_, 234subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
235 uint8x16_t inva_) 235 uint8x16_t inva_)
236{ 236{
237 uint8x16_t k, i, ak, j; 237 uint8x16_t k, i, ak, j;
238 238
239 bytes2nybbles(&k, &i, x); 239 bytes2nybbles(&k, &i, x);
240 ak = vqtbl1q_u8(inva_, k); 240 ak = vqtbl1q_u8(inva_, k);
241 j = i ^ k; 241 j = i ^ k;
242 *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i)); 242 *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
243 *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j)); 243 *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
244} 244}
245 245
246static uint8x16_t 246static uint8x16_t
247aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk) 247aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
248{ 248{
249 uint8x16_t io, jo; 249 uint8x16_t io, jo;
250 250
251 /* smear prk */ 251 /* smear prk */
252 prk ^= vextq_u8(vdupq_n_u8(0), prk, 12); 252 prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
253 prk ^= vextq_u8(vdupq_n_u8(0), prk, 8); 253 prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
254 prk ^= s63; 254 prk ^= s63;
255 255
256 /* subbytes */ 256 /* subbytes */
257 subbytes(&io, &jo, rk, inv, inva); 257 subbytes(&io, &jo, rk, inv, inva);
258 rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo); 258 rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);
259 259
260 /* add in smeared stuff */ 260 /* add in smeared stuff */
261 return rk ^ prk; 261 return rk ^ prk;
262} 262}
263 263
264static uint8x16_t 264static uint8x16_t
265aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot) 265aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
266{ 266{
267 uint32x4_t rk32; 267 uint32x4_t rk32;
268 268
269 /* extract rcon from rcon_rot */ 269 /* extract rcon from rcon_rot */
270 prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15); 270 prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
271 *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15); 271 *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);
272 272
273 /* rotate */ 273 /* rotate */
274 rk32 = vreinterpretq_u32_u8(rk); 274 rk32 = vreinterpretq_u32_u8(rk);
275 rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3)); 275 rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
276 rk = vreinterpretq_u8_u32(rk32); 276 rk = vreinterpretq_u8_u32(rk32);
277 rk = vextq_u8(rk, rk, 1); 277 rk = vextq_u8(rk, rk, 1);
278 278
279 return aes_schedule_low_round(rk, prk); 279 return aes_schedule_low_round(rk, prk);
280} 280}
281 281
282static uint8x16_t 282static uint8x16_t
283aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i) 283aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
284{ 284{
285 uint8x16_t y = vdupq_n_u8(0); 285 uint8x16_t y = vdupq_n_u8(0);
286 286
287 x ^= s63; 287 x ^= s63;
288 288
289 x = vqtbl1q_u8(x, mc_forward[0]); 289 x = vqtbl1q_u8(x, mc_forward[0]);
290 y ^= x; 290 y ^= x;
291 x = vqtbl1q_u8(x, mc_forward[0]); 291 x = vqtbl1q_u8(x, mc_forward[0]);
292 y ^= x; 292 y ^= x;
293 x = vqtbl1q_u8(x, mc_forward[0]); 293 x = vqtbl1q_u8(x, mc_forward[0]);
294 y ^= x; 294 y ^= x;
295 295
296 return vqtbl1q_u8(y, sr_i); 296 return vqtbl1q_u8(y, sr_i);
297} 297}
298 298
299static uint8x16_t 299static uint8x16_t
300aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i) 300aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
301{ 301{
302 302
303 return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt); 303 return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
304} 304}
305 305
306static uint8x16_t 306static uint8x16_t
307aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i) 307aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
308{ 308{
309 uint8x16_t y = vdupq_n_u8(0); 309 uint8x16_t y = vdupq_n_u8(0);
310 310
311 x = aes_schedule_transform(x, dks1); 311 x = aes_schedule_transform(x, dks1);
312 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 312 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
313 x = aes_schedule_transform(x, dks2); 313 x = aes_schedule_transform(x, dks2);
314 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 314 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
315 x = aes_schedule_transform(x, dks3); 315 x = aes_schedule_transform(x, dks3);
316 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 316 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
317 x = aes_schedule_transform(x, dks4); 317 x = aes_schedule_transform(x, dks4);
318 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 318 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
319 319
320 return vqtbl1q_u8(y, sr_i); 320 return vqtbl1q_u8(y, sr_i);
321} 321}
322 322
323static uint8x16_t 323static uint8x16_t
324aes_schedule_mangle_last_dec(uint8x16_t x) 324aes_schedule_mangle_last_dec(uint8x16_t x)
325{ 325{
326 326
327 return aes_schedule_transform(x ^ s63, deskew); 327 return aes_schedule_transform(x ^ s63, deskew);
328} 328}
329 329
330static uint8x16_t 330static uint8x16_t
331aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk) 331aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
332{ 332{
333 uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi); 333 uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
334 uint32x4_t prk32 = vreinterpretq_u32_u8(prk); 334 uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
335 uint32x4_t rk32; 335 uint32x4_t rk32;
336 336
337 rk32 = prkhi32; 337 rk32 = prkhi32;
338 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2), 338 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
339 vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)), 339 vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
340 3); 340 3);
341 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2), 341 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
342 vdupq_n_u32(vgetq_lane_u32(prk32, 3)), 342 vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
343 0); 343 0);
344 344
345 return vreinterpretq_u8_u32(rk32); 345 return vreinterpretq_u8_u32(rk32);
346} 346}
347 347
348static uint8x16_t 348static uint8x16_t
349aes_schedule_192_smearhi(uint8x16_t rk) 349aes_schedule_192_smearhi(uint8x16_t rk)
350{ 350{
351 uint64x2_t rk64 = vreinterpretq_u64_u8(rk); 351 uint64x2_t rk64 = vreinterpretq_u64_u8(rk);
352 352
353 rk64 = vsetq_lane_u64(0, rk64, 0); 353 rk64 = vsetq_lane_u64(0, rk64, 0);
354 354
355 return vreinterpretq_u8_u64(rk64); 355 return vreinterpretq_u8_u64(rk64);
356} 356}
357 357
358void 358void
359aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds) 359aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
360{ 360{
361 uint32_t *rk32 = enc->aese_aes.aes_rk; 361 uint32_t *rk32 = enc->aese_aes.aes_rk;
362 uint8x16_t mrk; /* mangled round key */ 362 uint8x16_t mrk; /* mangled round key */
363 uint8x16_t rk; /* round key */ 363 uint8x16_t rk; /* round key */
364 uint8x16_t prk; /* previous round key */ 364 uint8x16_t prk; /* previous round key */
365 uint8x16_t rcon_rot = rcon; 365 uint8x16_t rcon_rot = rcon;
366 uint64_t i = 3; 366 uint64_t i = 3;
367 367
368 /* input transform */ 368 /* input transform */
369 rk = aes_schedule_transform(vld1q_u8(key), ipt); 369 rk = aes_schedule_transform(vld1q_u8(key), ipt);
370 storeroundkey(rk32, rk); 370 storeroundkey(rk32, rk);
371 rk32 += 4; 371 rk32 += 4;
372 372
373 switch (nrounds) { 373 switch (nrounds) {
374 case 10: 374 case 10:
375 for (;;) { 375 for (;;) {
376 rk = aes_schedule_round(rk, rk, &rcon_rot); 376 rk = aes_schedule_round(rk, rk, &rcon_rot);
377 if (--nrounds == 0) 377 if (--nrounds == 0)
378 break; 378 break;
379 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 379 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
380 storeroundkey(rk32, mrk); 380 storeroundkey(rk32, mrk);
381 rk32 += 4; 381 rk32 += 4;
382 } 382 }
383 break; 383 break;
384 case 12: { 384 case 12: {
385 uint8x16_t prkhi; /* high half of previous round key */ 385 uint8x16_t prkhi; /* high half of previous round key */
386 386
387 prk = rk; 387 prk = rk;
388 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); 388 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
389 prkhi = aes_schedule_192_smearhi(rk); 389 prkhi = aes_schedule_192_smearhi(rk);
390 for (;;) { 390 for (;;) {
391 prk = aes_schedule_round(rk, prk, &rcon_rot); 391 prk = aes_schedule_round(rk, prk, &rcon_rot);
392 rk = vextq_u8(prkhi, prk, 8); 392 rk = vextq_u8(prkhi, prk, 8);
393 393
394 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 394 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
395 storeroundkey(rk32, mrk); 395 storeroundkey(rk32, mrk);
396 rk32 += 4; 396 rk32 += 4;
397 rk = aes_schedule_192_smear(prkhi, prk); 397 rk = aes_schedule_192_smear(prkhi, prk);
398 prkhi = aes_schedule_192_smearhi(rk); 398 prkhi = aes_schedule_192_smearhi(rk);
399 399
400 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 400 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
401 storeroundkey(rk32, mrk); 401 storeroundkey(rk32, mrk);
402 rk32 += 4; 402 rk32 += 4;
403 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 403 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
404 if ((nrounds -= 3) == 0) 404 if ((nrounds -= 3) == 0)
405 break; 405 break;
406 406
407 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 407 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
408 storeroundkey(rk32, mrk); 408 storeroundkey(rk32, mrk);
409 rk32 += 4; 409 rk32 += 4;
410 rk = aes_schedule_192_smear(prkhi, prk); 410 rk = aes_schedule_192_smear(prkhi, prk);
411 prkhi = aes_schedule_192_smearhi(rk); 411 prkhi = aes_schedule_192_smearhi(rk);
412 } 412 }
413 break; 413 break;
414 } 414 }
415 case 14: { 415 case 14: {
416 uint8x16_t pprk; /* previous previous round key */ 416 uint8x16_t pprk; /* previous previous round key */
417 417
418 prk = rk; 418 prk = rk;
419 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); 419 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
420 for (;;) { 420 for (;;) {
421 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 421 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
422 storeroundkey(rk32, mrk); 422 storeroundkey(rk32, mrk);
423 rk32 += 4; 423 rk32 += 4;
424 pprk = rk; 424 pprk = rk;
425 425
426 /* high round */ 426 /* high round */
427 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 427 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
428 if ((nrounds -= 2) == 0) 428 if ((nrounds -= 2) == 0)
429 break; 429 break;
430 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 430 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
431 storeroundkey(rk32, mrk); 431 storeroundkey(rk32, mrk);
432 rk32 += 4; 432 rk32 += 4;
433 433
434 /* low round */ 434 /* low round */
435 rk = vreinterpretq_u8_u32( 435 rk = vreinterpretq_u8_u32(
436 vdupq_n_u32( 436 vdupq_n_u32(
437 vgetq_lane_u32(vreinterpretq_u32_u8(rk), 437 vgetq_lane_u32(vreinterpretq_u32_u8(rk),
438 3))); 438 3)));
439 rk = aes_schedule_low_round(rk, pprk); 439 rk = aes_schedule_low_round(rk, pprk);
440 } 440 }
441 break; 441 break;
442 } 442 }
443 default: 443 default:
444 panic("invalid number of AES rounds: %u", nrounds); 444 panic("invalid number of AES rounds: %u", nrounds);
445 } 445 }
446 storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4])); 446 storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
447} 447}
448 448
449void 449void
450aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds) 450aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
451{ 451{
452 uint32_t *rk32 = dec->aesd_aes.aes_rk; 452 uint32_t *rk32 = dec->aesd_aes.aes_rk;
453 uint8x16_t mrk; /* mangled round key */ 453 uint8x16_t mrk; /* mangled round key */
454 uint8x16_t ork; /* original round key */ 454 uint8x16_t ork; /* original round key */
455 uint8x16_t rk; /* round key */ 455 uint8x16_t rk; /* round key */
456 uint8x16_t prk; /* previous round key */ 456 uint8x16_t prk; /* previous round key */
457 uint8x16_t rcon_rot = rcon; 457 uint8x16_t rcon_rot = rcon;
458 unsigned i = nrounds == 12 ? 0 : 2; 458 unsigned i = nrounds == 12 ? 0 : 2;
459 459
460 ork = vld1q_u8(key); 460 ork = vld1q_u8(key);
461 461
462 /* input transform */ 462 /* input transform */
463 rk = aes_schedule_transform(ork, ipt); 463 rk = aes_schedule_transform(ork, ipt);
464 464
465 /* go from end */ 465 /* go from end */
466 rk32 += 4*nrounds; 466 rk32 += 4*nrounds;
467 storeroundkey(rk32, vqtbl1q_u8(ork, sr[i])); 467 storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
468 rk32 -= 4; 468 rk32 -= 4;
469 i ^= 3; 469 i ^= 3;
470 470
471 switch (nrounds) { 471 switch (nrounds) {
472 case 10: 472 case 10:
473 for (;;) { 473 for (;;) {
474 rk = aes_schedule_round(rk, rk, &rcon_rot); 474 rk = aes_schedule_round(rk, rk, &rcon_rot);
475 if (--nrounds == 0) 475 if (--nrounds == 0)
476 break; 476 break;
477 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 477 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
478 storeroundkey(rk32, mrk); 478 storeroundkey(rk32, mrk);
479 rk32 -= 4; 479 rk32 -= 4;
480 } 480 }
481 break; 481 break;
482 case 12: { 482 case 12: {
483 uint8x16_t prkhi; /* high half of previous round key */ 483 uint8x16_t prkhi; /* high half of previous round key */
484 484
485 prk = rk; 485 prk = rk;
486 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); 486 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
487 prkhi = aes_schedule_192_smearhi(rk); 487 prkhi = aes_schedule_192_smearhi(rk);
488 for (;;) { 488 for (;;) {
489 prk = aes_schedule_round(rk, prk, &rcon_rot); 489 prk = aes_schedule_round(rk, prk, &rcon_rot);
490 rk = vextq_u8(prkhi, prk, 8); 490 rk = vextq_u8(prkhi, prk, 8);
491 491
492 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 492 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
493 storeroundkey(rk32, mrk); 493 storeroundkey(rk32, mrk);
494 rk32 -= 4; 494 rk32 -= 4;
495 rk = aes_schedule_192_smear(prkhi, prk); 495 rk = aes_schedule_192_smear(prkhi, prk);
496 prkhi = aes_schedule_192_smearhi(rk); 496 prkhi = aes_schedule_192_smearhi(rk);
497 497
498 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 498 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
499 storeroundkey(rk32, mrk); 499 storeroundkey(rk32, mrk);
500 rk32 -= 4; 500 rk32 -= 4;
501 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 501 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
502 if ((nrounds -= 3) == 0) 502 if ((nrounds -= 3) == 0)
503 break; 503 break;
504 504
505 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 505 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
506 storeroundkey(rk32, mrk); 506 storeroundkey(rk32, mrk);
507 rk32 -= 4; 507 rk32 -= 4;
508 rk = aes_schedule_192_smear(prkhi, prk); 508 rk = aes_schedule_192_smear(prkhi, prk);
509 prkhi = aes_schedule_192_smearhi(rk); 509 prkhi = aes_schedule_192_smearhi(rk);
510 } 510 }
511 break; 511 break;
512 } 512 }
513 case 14: { 513 case 14: {
514 uint8x16_t pprk; /* previous previous round key */ 514 uint8x16_t pprk; /* previous previous round key */
515 515
516 prk = rk; 516 prk = rk;
517 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); 517 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
518 for (;;) { 518 for (;;) {
519 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 519 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
520 storeroundkey(rk32, mrk); 520 storeroundkey(rk32, mrk);
521 rk32 -= 4; 521 rk32 -= 4;
522 pprk = rk; 522 pprk = rk;
523 523
524 /* high round */ 524 /* high round */
525 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 525 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
526 if ((nrounds -= 2) == 0) 526 if ((nrounds -= 2) == 0)
527 break; 527 break;
528 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 528 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
529 storeroundkey(rk32, mrk); 529 storeroundkey(rk32, mrk);
530 rk32 -= 4; 530 rk32 -= 4;
531 531
532 /* low round */ 532 /* low round */
533 rk = vreinterpretq_u8_u32( 533 rk = vreinterpretq_u8_u32(
534 vdupq_n_u32( 534 vdupq_n_u32(
535 vgetq_lane_u32(vreinterpretq_u32_u8(rk), 535 vgetq_lane_u32(vreinterpretq_u32_u8(rk),
536 3))); 536 3)));
537 rk = aes_schedule_low_round(rk, pprk); 537 rk = aes_schedule_low_round(rk, pprk);
538 } 538 }
539 break; 539 break;
540 } 540 }
541 default: 541 default:
542 panic("invalid number of AES rounds: %u", nrounds); 542 panic("invalid number of AES rounds: %u", nrounds);
543 } 543 }
544 storeroundkey(rk32, aes_schedule_mangle_last_dec(rk)); 544 storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
545} 545}
546 546
547#ifdef __aarch64__ 547#ifdef __aarch64__
548 548
549/* 549/*
550 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we 550 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
551 * do the performance-critical parts -- encryption and decryption -- in 551 * do the performance-critical parts -- encryption and decryption -- in
552 * hand-written assembly on arm32. 552 * hand-written assembly on arm32.
553 */ 553 */
554 554
555uint8x16_t 555uint8x16_t
556aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds) 556aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
557{ 557{
558 const uint32_t *rk32 = enc->aese_aes.aes_rk; 558 const uint32_t *rk32 = enc->aese_aes.aes_rk;
559 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 559 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
560 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 560 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
561 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0]; 561 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
562 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1]; 562 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
563 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0]; 563 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
564 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1]; 564 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
565 uint8x16_t io, jo; 565 uint8x16_t io, jo;
566 unsigned rmod4 = 0; 566 unsigned rmod4 = 0;
567 567
568 x = aes_schedule_transform(x, ipt); 568 x = aes_schedule_transform(x, ipt);
569 x ^= loadroundkey(rk32); 569 x ^= loadroundkey(rk32);
570 for (;;) { 570 for (;;) {
571 uint8x16_t A, A2, A2_B, A2_B_D; 571 uint8x16_t A, A2, A2_B, A2_B_D;
572 572
573 subbytes(&io, &jo, x, inv_, inva_); 573 subbytes(&io, &jo, x, inv_, inva_);
574 574
575 rk32 += 4; 575 rk32 += 4;
576 rmod4 = (rmod4 + 1) % 4; 576 rmod4 = (rmod4 + 1) % 4;
577 if (--nrounds == 0) 577 if (--nrounds == 0)
578 break; 578 break;
579 579
580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); 580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
581 A ^= loadroundkey(rk32); 581 A ^= loadroundkey(rk32);
582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); 582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); 583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); 584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); 585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
586 } 586 }
587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); 587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
588 x ^= loadroundkey(rk32); 588 x ^= loadroundkey(rk32);
589 return vqtbl1q_u8(x, sr[rmod4]); 589 return vqtbl1q_u8(x, sr[rmod4]);
590} 590}
591 591
 592uint8x16x2_t
 593aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
 594{
 595 const uint32_t *rk32 = enc->aese_aes.aes_rk;
 596 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 597 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 598 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
 599 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
 600 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
 601 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
 602 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 603 uint8x16_t io0, jo0, io1, jo1;
 604 unsigned rmod4 = 0;
 605
 606 x0 = aes_schedule_transform(x0, ipt);
 607 x1 = aes_schedule_transform(x1, ipt);
 608 x0 ^= loadroundkey(rk32);
 609 x1 ^= loadroundkey(rk32);
 610 for (;;) {
 611 uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
 612 uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
 613
 614 subbytes(&io0, &jo0, x0, inv_, inva_);
 615 subbytes(&io1, &jo1, x1, inv_, inva_);
 616
 617 rk32 += 4;
 618 rmod4 = (rmod4 + 1) % 4;
 619 if (--nrounds == 0)
 620 break;
 621
 622 A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
 623 A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
 624 A_0 ^= loadroundkey(rk32);
 625 A_1 ^= loadroundkey(rk32);
 626 A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
 627 A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
 628 A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
 629 A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
 630 A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
 631 A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
 632 x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
 633 x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
 634 }
 635 x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
 636 x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
 637 x0 ^= loadroundkey(rk32);
 638 x1 ^= loadroundkey(rk32);
 639 return (uint8x16x2_t) { .val = {
 640 [0] = vqtbl1q_u8(x0, sr[rmod4]),
 641 [1] = vqtbl1q_u8(x1, sr[rmod4]),
 642 } };
 643}
 644
592uint8x16_t 645uint8x16_t
593aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) 646aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
594{ 647{
595 const uint32_t *rk32 = dec->aesd_aes.aes_rk; 648 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
596 unsigned i = 3 & ~(nrounds - 1); 649 unsigned i = 3 & ~(nrounds - 1);
597 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 650 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
598 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 651 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
599 uint8x16_t io, jo, mc; 652 uint8x16_t io, jo, mc;
600 653
601 x = aes_schedule_transform(x, dipt); 654 x = aes_schedule_transform(x, dipt);
602 x ^= loadroundkey(rk32); 655 x ^= loadroundkey(rk32);
603 rk32 += 4; 656 rk32 += 4;
604 657
605 mc = mc_forward[3]; 658 mc = mc_forward[3];
606 for (;;) { 659 for (;;) {
607 subbytes(&io, &jo, x, inv_, inva_); 660 subbytes(&io, &jo, x, inv_, inva_);
608 if (--nrounds == 0) 661 if (--nrounds == 0)
609 break; 662 break;
610 663
611 x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo); 664 x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
612 x ^= loadroundkey(rk32); 665 x ^= loadroundkey(rk32);
613 rk32 += 4; /* next round key */ 666 rk32 += 4; /* next round key */
614 667
615 x = vqtbl1q_u8(x, mc); 668 x = vqtbl1q_u8(x, mc);
616 x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo); 669 x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);
617 670
618 x = vqtbl1q_u8(x, mc); 671 x = vqtbl1q_u8(x, mc);
619 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); 672 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);
620 673
621 x = vqtbl1q_u8(x, mc); 674 x = vqtbl1q_u8(x, mc);
622 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); 675 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);
623 676
624 mc = vextq_u8(mc, mc, 12); 677 mc = vextq_u8(mc, mc, 12);
625 } 678 }
626 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); 679 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
627 x ^= loadroundkey(rk32); 680 x ^= loadroundkey(rk32);
628 return vqtbl1q_u8(x, sr[i]); 681 return vqtbl1q_u8(x, sr[i]);
629} 682}
630 683
 684uint8x16x2_t
 685aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
 686{
 687 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
 688 unsigned i = 3 & ~(nrounds - 1);
 689 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 690 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 691 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 692 uint8x16_t io0, jo0, io1, jo1, mc;
 693
 694 x0 = aes_schedule_transform(x0, dipt);
 695 x1 = aes_schedule_transform(x1, dipt);
 696 x0 ^= loadroundkey(rk32);
 697 x1 ^= loadroundkey(rk32);
 698 rk32 += 4;
 699
 700 mc = mc_forward[3];
 701 for (;;) {
 702 subbytes(&io0, &jo0, x0, inv_, inva_);
 703 subbytes(&io1, &jo1, x1, inv_, inva_);
 704 if (--nrounds == 0)
 705 break;
 706
 707 x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
 708 x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
 709 x0 ^= loadroundkey(rk32);
 710 x1 ^= loadroundkey(rk32);
 711 rk32 += 4; /* next round key */
 712
 713 x0 = vqtbl1q_u8(x0, mc);
 714 x1 = vqtbl1q_u8(x1, mc);
 715 x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
 716 x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
 717
 718 x0 = vqtbl1q_u8(x0, mc);
 719 x1 = vqtbl1q_u8(x1, mc);
 720 x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
 721 x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
 722
 723 x0 = vqtbl1q_u8(x0, mc);
 724 x1 = vqtbl1q_u8(x1, mc);
 725 x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
 726 x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
 727
 728 mc = vextq_u8(mc, mc, 12);
 729 }
 730 x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
 731 x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
 732 x0 ^= loadroundkey(rk32);
 733 x1 ^= loadroundkey(rk32);
 734 return (uint8x16x2_t) { .val = {
 735 [0] = vqtbl1q_u8(x0, sr[i]),
 736 [1] = vqtbl1q_u8(x1, sr[i]),
 737 } };
 738}
 739
631#endif 740#endif

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,309 +1,382 @@ @@ -1,309 +1,382 @@
1/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ 1/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/cdefs.h> 29#include <sys/cdefs.h>
30__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); 30__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
31 31
32#include <sys/endian.h> 32#include <sys/endian.h>
33 33
34#ifdef _KERNEL 34#ifdef _KERNEL
35#include <sys/systm.h> 35#include <sys/systm.h>
36#include <lib/libkern/libkern.h> 36#include <lib/libkern/libkern.h>
37#else 37#else
38#include <assert.h> 38#include <assert.h>
39#include <inttypes.h> 39#include <inttypes.h>
40#include <stdio.h> 40#include <stdio.h>
41#define KASSERT assert 41#define KASSERT assert
42#endif 42#endif
43 43
44#include <crypto/aes/arch/arm/aes_neon.h> 44#include <crypto/aes/arch/arm/aes_neon.h>
45 45
46#include "aes_neon_impl.h" 46#include "aes_neon_impl.h"
47 47
48static inline uint8x16_t 48static inline uint8x16_t
49loadblock(const void *in) 49loadblock(const void *in)
50{ 50{
51 return vld1q_u8(in); 51 return vld1q_u8(in);
52} 52}
53 53
54static inline void 54static inline void
55storeblock(void *out, uint8x16_t block) 55storeblock(void *out, uint8x16_t block)
56{ 56{
57 vst1q_u8(out, block); 57 vst1q_u8(out, block);
58} 58}
59 59
60void 60void
61aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], 61aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16],
62 uint8_t out[static 16], uint32_t nrounds) 62 uint8_t out[static 16], uint32_t nrounds)
63{ 63{
64 uint8x16_t block; 64 uint8x16_t block;
65 65
66 block = loadblock(in); 66 block = loadblock(in);
67 block = aes_neon_enc1(enc, block, nrounds); 67 block = aes_neon_enc1(enc, block, nrounds);
68 storeblock(out, block); 68 storeblock(out, block);
69} 69}
70 70
71void 71void
72aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], 72aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16],
73 uint8_t out[static 16], uint32_t nrounds) 73 uint8_t out[static 16], uint32_t nrounds)
74{ 74{
75 uint8x16_t block; 75 uint8x16_t block;
76 76
77 block = loadblock(in); 77 block = loadblock(in);
78 block = aes_neon_dec1(dec, block, nrounds); 78 block = aes_neon_dec1(dec, block, nrounds);
79 storeblock(out, block); 79 storeblock(out, block);
80} 80}
81 81
82void 82void
83aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], 83aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
84 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 84 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
85 uint32_t nrounds) 85 uint32_t nrounds)
86{ 86{
87 uint8x16_t cv; 87 uint8x16_t cv;
88 88
89 KASSERT(nbytes); 89 KASSERT(nbytes);
90 90
91 cv = loadblock(iv); 91 cv = loadblock(iv);
92 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 92 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
93 cv ^= loadblock(in); 93 cv ^= loadblock(in);
94 cv = aes_neon_enc1(enc, cv, nrounds); 94 cv = aes_neon_enc1(enc, cv, nrounds);
95 storeblock(out, cv); 95 storeblock(out, cv);
96 } 96 }
97 storeblock(iv, cv); 97 storeblock(iv, cv);
98} 98}
99 99
100void 100void
101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], 101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
103 uint32_t nrounds) 103 uint32_t nrounds)
104{ 104{
105 uint8x16_t iv0, cv, b; 105 uint8x16_t iv0, cv, b;
106 106
107 KASSERT(nbytes); 107 KASSERT(nbytes);
108 KASSERT(nbytes % 16 == 0); 108 KASSERT(nbytes % 16 == 0);
109 109
110 iv0 = loadblock(iv); 110 iv0 = loadblock(iv);
111 cv = loadblock(in + nbytes - 16); 111 cv = loadblock(in + nbytes - 16);
112 storeblock(iv, cv); 112 storeblock(iv, cv);
113 113
114 for (;;) { 114 if (nbytes % 32) {
 115 KASSERT(nbytes % 32 == 16);
115 b = aes_neon_dec1(dec, cv, nrounds); 116 b = aes_neon_dec1(dec, cv, nrounds);
116 if ((nbytes -= 16) == 0) 117 if ((nbytes -= 16) == 0)
117 break; 118 goto out;
 119 cv = loadblock(in + nbytes - 16);
 120 storeblock(out + nbytes, cv ^ b);
 121 }
 122
 123 for (;;) {
 124 uint8x16x2_t b2;
 125
 126 KASSERT(nbytes >= 32);
 127
 128 b2.val[1] = cv;
 129 b2.val[0] = cv = loadblock(in + nbytes - 32);
 130 b2 = aes_neon_dec2(dec, b2, nrounds);
 131 storeblock(out + nbytes - 16, cv ^ b2.val[1]);
 132 if ((nbytes -= 32) == 0) {
 133 b = b2.val[0];
 134 goto out;
 135 }
118 cv = loadblock(in + nbytes - 16); 136 cv = loadblock(in + nbytes - 16);
119 storeblock(out + nbytes, b ^ cv); 137 storeblock(out + nbytes, cv ^ b2.val[0]);
120 } 138 }
121 storeblock(out, b ^ iv0); 139
 140out: storeblock(out, b ^ iv0);
122} 141}
123 142
124static inline uint8x16_t 143static inline uint8x16_t
125aes_neon_xts_update(uint8x16_t t8) 144aes_neon_xts_update(uint8x16_t t8)
126{ 145{
127 const int32x4_t zero = vdupq_n_s32(0); 146 const int32x4_t zero = vdupq_n_s32(0);
128 const int32x4_t carry = {0x87, 1, 1, 1}; 147 const int32x4_t carry = {0x87, 1, 1, 1};
129 int32x4_t t, t_; 148 int32x4_t t, t_;
130 uint32x4_t mask; 149 uint32x4_t mask;
131 150
132 t = vreinterpretq_s32_u8(t8); 151 t = vreinterpretq_s32_u8(t8);
133 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ 152 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */
134 mask = vextq_u32(mask, mask, 3); /* rotate quarters */ 153 mask = vextq_u32(mask, mask, 3); /* rotate quarters */
135 t_ = vsliq_n_s32(zero, t, 1); /* shift */ 154 t_ = vsliq_n_s32(zero, t, 1); /* shift */
136 t_ ^= carry & mask; 155 t_ ^= carry & mask;
137 156
138 return vreinterpretq_u8_s32(t_); 157 return vreinterpretq_u8_s32(t_);
139} 158}
140 159
141static int 160static int
142aes_neon_xts_update_selftest(void) 161aes_neon_xts_update_selftest(void)
143{ 162{
144 static const struct { 163 static const struct {
145 uint32_t in[4], out[4]; 164 uint32_t in[4], out[4];
146 } cases[] = { 165 } cases[] = {
147 [0] = { {1}, {2} }, 166 [0] = { {1}, {2} },
148 [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, 167 [1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
149 [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, 168 [2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
150 [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, 169 [3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
151 [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, 170 [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
152 [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, 171 [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
153 }; 172 };
154 unsigned i; 173 unsigned i;
155 uint32_t t[4]; 174 uint32_t t[4];
156 int result = 0; 175 int result = 0;
157 176
158 for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { 177 for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
159 t[0] = cases[i].in[0]; 178 t[0] = cases[i].in[0];
160 t[1] = cases[i].in[1]; 179 t[1] = cases[i].in[1];
161 t[2] = cases[i].in[2]; 180 t[2] = cases[i].in[2];
162 t[3] = cases[i].in[3]; 181 t[3] = cases[i].in[3];
163 storeblock(t, aes_neon_xts_update(loadblock(t))); 182 storeblock(t, aes_neon_xts_update(loadblock(t)));
164 if (t[0] != cases[i].out[0] || 183 if (t[0] != cases[i].out[0] ||
165 t[1] != cases[i].out[1] || 184 t[1] != cases[i].out[1] ||
166 t[2] != cases[i].out[2] || 185 t[2] != cases[i].out[2] ||
167 t[3] != cases[i].out[3]) { 186 t[3] != cases[i].out[3]) {
168 printf("%s %u:" 187 printf("%s %u:"
169 " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", 188 " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
170 __func__, i, t[0], t[1], t[2], t[3]); 189 __func__, i, t[0], t[1], t[2], t[3]);
171 result = -1; 190 result = -1;
172 } 191 }
173 } 192 }
174 193
175 return result; 194 return result;
176} 195}
177 196
178void 197void
179aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], 198aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
180 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 199 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
181 uint32_t nrounds) 200 uint32_t nrounds)
182{ 201{
183 uint8x16_t t, b; 202 uint8x16_t t, b;
184 203
185 KASSERT(nbytes); 204 KASSERT(nbytes);
186 KASSERT(nbytes % 16 == 0); 205 KASSERT(nbytes % 16 == 0);
187 206
188 t = loadblock(tweak); 207 t = loadblock(tweak);
189 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 208 if (nbytes % 32) {
 209 KASSERT(nbytes % 32 == 16);
190 b = t ^ loadblock(in); 210 b = t ^ loadblock(in);
191 b = aes_neon_enc1(enc, b, nrounds); 211 b = aes_neon_enc1(enc, b, nrounds);
192 storeblock(out, t ^ b); 212 storeblock(out, t ^ b);
193 t = aes_neon_xts_update(t); 213 t = aes_neon_xts_update(t);
 214 nbytes -= 16;
 215 in += 16;
 216 out += 16;
 217 }
 218 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 219 uint8x16_t t1;
 220 uint8x16x2_t b2;
 221
 222 t1 = aes_neon_xts_update(t);
 223 b2.val[0] = t ^ loadblock(in);
 224 b2.val[1] = t1 ^ loadblock(in + 16);
 225 b2 = aes_neon_enc2(enc, b2, nrounds);
 226 storeblock(out, b2.val[0] ^ t);
 227 storeblock(out + 16, b2.val[1] ^ t1);
 228
 229 t = aes_neon_xts_update(t1);
194 } 230 }
195 storeblock(tweak, t); 231 storeblock(tweak, t);
196} 232}
197 233
198void 234void
199aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], 235aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
200 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 236 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
201 uint32_t nrounds) 237 uint32_t nrounds)
202{ 238{
203 uint8x16_t t, b; 239 uint8x16_t t, b;
204 240
205 KASSERT(nbytes); 241 KASSERT(nbytes);
206 KASSERT(nbytes % 16 == 0); 242 KASSERT(nbytes % 16 == 0);
207 243
208 t = loadblock(tweak); 244 t = loadblock(tweak);
209 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 245 if (nbytes % 32) {
 246 KASSERT(nbytes % 32 == 16);
210 b = t ^ loadblock(in); 247 b = t ^ loadblock(in);
211 b = aes_neon_dec1(dec, b, nrounds); 248 b = aes_neon_dec1(dec, b, nrounds);
212 storeblock(out, t ^ b); 249 storeblock(out, t ^ b);
213 t = aes_neon_xts_update(t); 250 t = aes_neon_xts_update(t);
 251 nbytes -= 16;
 252 in += 16;
 253 out += 16;
 254 }
 255 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 256 uint8x16_t t1;
 257 uint8x16x2_t b2;
 258
 259 t1 = aes_neon_xts_update(t);
 260 b2.val[0] = t ^ loadblock(in);
 261 b2.val[1] = t1 ^ loadblock(in + 16);
 262 b2 = aes_neon_dec2(dec, b2, nrounds);
 263 storeblock(out, b2.val[0] ^ t);
 264 storeblock(out + 16, b2.val[1] ^ t1);
 265
 266 t = aes_neon_xts_update(t1);
214 } 267 }
215 storeblock(tweak, t); 268 storeblock(tweak, t);
216} 269}
217 270
218void 271void
219aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], 272aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
220 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) 273 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
221{ 274{
222 uint8x16_t auth; 275 uint8x16_t auth;
223 276
224 KASSERT(nbytes); 277 KASSERT(nbytes);
225 KASSERT(nbytes % 16 == 0); 278 KASSERT(nbytes % 16 == 0);
226 279
227 auth = loadblock(auth0); 280 auth = loadblock(auth0);
228 for (; nbytes; nbytes -= 16, in += 16) 281 for (; nbytes; nbytes -= 16, in += 16)
229 auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds); 282 auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds);
230 storeblock(auth0, auth); 283 storeblock(auth0, auth);
231} 284}
232 285
233/* 286/*
234 * XXX On aarch64, we have enough registers that we should be able to 287 * XXX On aarch64, we have enough registers that we should be able to
235 * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' 288 * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
236 * function, which should substantially improve CCM throughput. 289 * function, which should substantially improve CCM throughput.
237 */ 290 */
238 291
239#if _BYTE_ORDER == _LITTLE_ENDIAN 292#if _BYTE_ORDER == _LITTLE_ENDIAN
240#define vbetoh32q_u8 vrev32q_u8 293#define vbetoh32q_u8 vrev32q_u8
241#define vhtobe32q_u8 vrev32q_u8 294#define vhtobe32q_u8 vrev32q_u8
242#elif _BYTE_ORDER == _BIG_ENDIAN 295#elif _BYTE_ORDER == _BIG_ENDIAN
243#define vbetoh32q_u8(x) (x) 296#define vbetoh32q_u8(x) (x)
244#define vhtobe32q_u8(x) (x) 297#define vhtobe32q_u8(x) (x)
245#else 298#else
246#error what kind of endian are you anyway 299#error what kind of endian are you anyway
247#endif 300#endif
248 301
249void 302void
250aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], 303aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
251 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 304 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
252 uint32_t nrounds) 305 uint32_t nrounds)
253{ 306{
254 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 307 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
255 uint8x16_t auth, ptxt, ctr_be; 308 uint8x16_t auth, ptxt, ctr_be;
256 uint32x4_t ctr; 309 uint32x4_t ctr;
257 310
258 KASSERT(nbytes); 311 KASSERT(nbytes);
259 KASSERT(nbytes % 16 == 0); 312 KASSERT(nbytes % 16 == 0);
260 313
261 auth = loadblock(authctr); 314 auth = loadblock(authctr);
262 ctr_be = loadblock(authctr + 16); 315 ctr_be = loadblock(authctr + 16);
263 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 316 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
264 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 317 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 318 uint8x16x2_t b2;
265 ptxt = loadblock(in); 319 ptxt = loadblock(in);
266 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 
267 ctr = vaddq_u32(ctr, ctr32_inc); 320 ctr = vaddq_u32(ctr, ctr32_inc);
268 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 321 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
269 storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); 322
 323 b2.val[0] = auth ^ ptxt;
 324 b2.val[1] = ctr_be;
 325 b2 = aes_neon_enc2(enc, b2, nrounds);
 326 auth = b2.val[0];
 327 storeblock(out, ptxt ^ b2.val[1]);
270 } 328 }
271 storeblock(authctr, auth); 329 storeblock(authctr, auth);
272 storeblock(authctr + 16, ctr_be); 330 storeblock(authctr + 16, ctr_be);
273} 331}
274 332
275void 333void
276aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], 334aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
277 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 335 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
278 uint32_t nrounds) 336 uint32_t nrounds)
279{ 337{
280 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 338 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
281 uint8x16_t auth, ctr_be, ptxt; 339 uint8x16_t auth, ctr_be, ptxt, pad;
282 uint32x4_t ctr; 340 uint32x4_t ctr;
283 341
284 KASSERT(nbytes); 342 KASSERT(nbytes);
285 KASSERT(nbytes % 16 == 0); 343 KASSERT(nbytes % 16 == 0);
286 344
287 auth = loadblock(authctr); 
288 ctr_be = loadblock(authctr + 16); 345 ctr_be = loadblock(authctr + 16);
289 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 346 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
290 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 347 ctr = vaddq_u32(ctr, ctr32_inc);
 348 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
 349 pad = aes_neon_enc1(enc, ctr_be, nrounds);
 350 auth = loadblock(authctr);
 351 for (;; in += 16, out += 16) {
 352 uint8x16x2_t b2;
 353
 354 ptxt = loadblock(in) ^ pad;
 355 auth ^= ptxt;
 356 storeblock(out, ptxt);
 357
 358 if ((nbytes -= 16) == 0)
 359 break;
 360
291 ctr = vaddq_u32(ctr, ctr32_inc); 361 ctr = vaddq_u32(ctr, ctr32_inc);
292 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 362 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
293 ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); 363 b2.val[0] = auth;
294 storeblock(out, ptxt); 364 b2.val[1] = ctr_be;
295 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 365 b2 = aes_neon_enc2(enc, b2, nrounds);
 366 auth = b2.val[0];
 367 pad = b2.val[1];
296 } 368 }
 369 auth = aes_neon_enc1(enc, auth, nrounds);
297 storeblock(authctr, auth); 370 storeblock(authctr, auth);
298 storeblock(authctr + 16, ctr_be); 371 storeblock(authctr + 16, ctr_be);
299} 372}
300 373
301int 374int
302aes_neon_selftest(void) 375aes_neon_selftest(void)
303{ 376{
304 377
305 if (aes_neon_xts_update_selftest()) 378 if (aes_neon_xts_update_selftest())
306 return -1; 379 return -1;
307 380
308 return 0; 381 return 0;
309} 382}
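
The rewritten XTS loops above share one control-flow shape: when the buffer holds an odd number of 16-byte blocks, one block is peeled off through the existing one-block path, and the remainder is walked in 32-byte steps so both lanes of the new two-block path carry real work, with aes_neon_xts_update chaining the tweak across the two lanes.  Below is a minimal portable sketch of that shape only; process(), do_block1() and do_block2() are hypothetical stand-ins, not the NetBSD routines.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical placeholders for the 1-block and 2-block primitives. */
static void
do_block1(uint8_t out[16], const uint8_t in[16])
{
        memcpy(out, in, 16);            /* stand-in: identity */
}

static void
do_block2(uint8_t out[32], const uint8_t in[32])
{
        memcpy(out, in, 32);            /* stand-in: identity */
}

static void
process(uint8_t *out, const uint8_t *in, size_t nbytes)
{
        /* Caller guarantees nbytes > 0 and nbytes % 16 == 0. */
        if (nbytes % 32) {              /* odd number of blocks */
                do_block1(out, in);
                in += 16, out += 16, nbytes -= 16;
        }
        for (; nbytes; nbytes -= 32, in += 32, out += 32)
                do_block2(out, in);     /* two blocks per iteration */
}

The CCM loops reach the same goal differently: each aes_neon_enc2 call pairs the CBC-MAC block with a CTR pad block, so both lanes do useful work on every pass through the loop.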

cvs diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h

--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,42 +1,71 @@ @@ -1,42 +1,71 @@
1/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ 1/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
31 31
32#include <sys/types.h> 32#include <sys/types.h>
33 33
34#include "arm_neon.h" 34#include "arm_neon.h"
35 35
36#include <crypto/aes/aes.h> 36#include <crypto/aes/aes.h>
37#include <crypto/aes/arch/arm/aes_neon.h> 37#include <crypto/aes/arch/arm/aes_neon.h>
38 38
39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); 39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); 40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
41 41
 42#ifdef __aarch64__
 43
 44uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
 45uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
 46
 47#else
 48
 49static inline uint8x16x2_t
 50aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
 51{
 52
 53 return (uint8x16x2_t) { .val = {
 54 [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
 55 [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
 56 } };
 57}
 58
 59static inline uint8x16x2_t
 60aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
 61{
 62
 63 return (uint8x16x2_t) { .val = {
 64 [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
 65 [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
 66 } };
 67}
 68
 69#endif
 70
42#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ 71#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
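
The header change is what lets the callers above use the two-block interface unconditionally: on aarch64, where 32 SIMD registers leave room to interleave two vpaes states, aes_neon_enc2 and aes_neon_dec2 are real two-lane routines, while on 32-bit ARM the static inline fallbacks simply run the one-block routine twice.  The sketch below shows that fallback pattern in plain C under hypothetical names (struct pair16, one_block(), two_blocks()); it is an illustration of the shape, not the NEON code.

#include <stdint.h>

/* Hypothetical plain-C analogue of the uint8x16x2_t interface. */
struct pair16 { uint8_t val[2][16]; };

/* Stand-in for the one-block primitive (aes_neon_enc1 in the real code). */
static void
one_block(uint8_t b[16])
{
        for (int i = 0; i < 16; i++)
                b[i] ^= 0x5a;
}

/*
 * Two-block interface.  This is the shape of the #else fallback above:
 * run the one-block routine twice.  A native backend (the aarch64 vpaes
 * one) can substitute a genuinely interleaved implementation.
 */
static struct pair16
two_blocks(struct pair16 p)
{
        one_block(p.val[0]);
        one_block(p.val[1]);
        return p;
}

Passing the two-element aggregate by value mirrors uint8x16x2_t, which the aarch64 calling convention can keep in vector registers across the call.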

cvs diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,534 +1,536 @@ @@ -1,534 +1,536 @@
1/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H 29#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
30#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H 30#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
31 31
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint32x4_t uint32x4_t; 42typedef __Uint32x4_t uint32x4_t;
43typedef __Uint64x2_t uint64x2_t; 43typedef __Uint64x2_t uint64x2_t;
44typedef __Uint8x16_t uint8x16_t; 44typedef __Uint8x16_t uint8x16_t;
 45typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
45#else 46#else
46typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
47typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
48typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
49typedef __simd128_uint32_t uint32x4_t; 50typedef __simd128_uint32_t uint32x4_t;
50typedef __simd128_uint64_t uint64x2_t; 51typedef __simd128_uint64_t uint64x2_t;
51typedef __simd128_uint8_t uint8x16_t; 52typedef __simd128_uint8_t uint8x16_t;
52 53
53typedef __simd64_int8_t int8x8_t; 54typedef __simd64_int8_t int8x8_t;
54typedef __simd64_uint8_t uint8x8_t; 55typedef __simd64_uint8_t uint8x8_t;
55typedef __builtin_neon_udi uint64x1_t; 56typedef __builtin_neon_udi uint64x1_t;
56typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 57typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
 58typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
57#endif 59#endif
58 60
59#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
60#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
61#else 63#else
62#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
63#endif 65#endif
64 66
65#elif defined(__clang__) 67#elif defined(__clang__)
66 68
67#define _INTRINSATTR \ 69#define _INTRINSATTR \
68 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
69 71
70typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; 72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
71typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; 73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
72typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; 74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
73typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; 75typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
74typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; 76typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
75typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; 77typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
76 78
77typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; 79typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
78typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 80typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
79 81
80#ifdef __LITTLE_ENDIAN__ 82#ifdef __LITTLE_ENDIAN__
81#define __neon_lane_index(__v, __i) __i 83#define __neon_lane_index(__v, __i) __i
82#else 84#else
83#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 85#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
84#endif 86#endif
85 87
86#else 88#else
87 89
88#error Teach me how to neon in your compile! 90#error Teach me how to neon in your compile!
89 91
90#endif 92#endif
91 93
92_INTRINSATTR 94_INTRINSATTR
93static __inline uint32x4_t 95static __inline uint32x4_t
94vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) 96vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
95{ 97{
96 return __v0 + __v1; 98 return __v0 + __v1;
97} 99}
98 100
99_INTRINSATTR 101_INTRINSATTR
100static __inline uint32x4_t 102static __inline uint32x4_t
101vcltq_s32(int32x4_t __v0, int32x4_t __v1) 103vcltq_s32(int32x4_t __v0, int32x4_t __v1)
102{ 104{
103 return (uint32x4_t)(__v0 < __v1); 105 return (uint32x4_t)(__v0 < __v1);
104} 106}
105 107
106_INTRINSATTR 108_INTRINSATTR
107static __inline int32x4_t 109static __inline int32x4_t
108vdupq_n_s32(int32_t __x) 110vdupq_n_s32(int32_t __x)
109{ 111{
110 return (int32x4_t) { __x, __x, __x, __x }; 112 return (int32x4_t) { __x, __x, __x, __x };
111} 113}
112 114
113_INTRINSATTR 115_INTRINSATTR
114static __inline uint32x4_t 116static __inline uint32x4_t
115vdupq_n_u32(uint32_t __x) 117vdupq_n_u32(uint32_t __x)
116{ 118{
117 return (uint32x4_t) { __x, __x, __x, __x }; 119 return (uint32x4_t) { __x, __x, __x, __x };
118} 120}
119 121
120_INTRINSATTR 122_INTRINSATTR
121static __inline uint8x16_t 123static __inline uint8x16_t
122vdupq_n_u8(uint8_t __x) 124vdupq_n_u8(uint8_t __x)
123{ 125{
124 return (uint8x16_t) { 126 return (uint8x16_t) {
125 __x, __x, __x, __x, __x, __x, __x, __x, 127 __x, __x, __x, __x, __x, __x, __x, __x,
126 __x, __x, __x, __x, __x, __x, __x, __x, 128 __x, __x, __x, __x, __x, __x, __x, __x,
127 }; 129 };
128} 130}
129 131
130#if defined(__GNUC__) && !defined(__clang__) 132#if defined(__GNUC__) && !defined(__clang__)
131_INTRINSATTR 133_INTRINSATTR
132static __inline uint32x4_t 134static __inline uint32x4_t
133vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) 135vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
134{ 136{
135#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 137#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
136 return __builtin_shuffle(__hi, __lo, 138 return __builtin_shuffle(__hi, __lo,
137 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); 139 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
138#else 140#else
139 return __builtin_shuffle(__lo, __hi, 141 return __builtin_shuffle(__lo, __hi,
140 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); 142 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
141#endif 143#endif
142} 144}
143#elif defined(__clang__) 145#elif defined(__clang__)
144#ifdef __LITTLE_ENDIAN__ 146#ifdef __LITTLE_ENDIAN__
145#define vextq_u32(__lo, __hi, __i) \ 147#define vextq_u32(__lo, __hi, __i) \
146 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 148 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
147 (int8x16_t)(__hi), (__i), 50) 149 (int8x16_t)(__hi), (__i), 50)
148#else 150#else
149#define vextq_u32(__lo, __hi, __i) ( \ 151#define vextq_u32(__lo, __hi, __i) ( \
150{ \ 152{ \
151 uint32x4_t __tlo = (__lo); \ 153 uint32x4_t __tlo = (__lo); \
152 uint32x4_t __thi = (__hi); \ 154 uint32x4_t __thi = (__hi); \
153 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ 155 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
154 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ 156 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
155 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 157 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
156 (int8x16_t)__hi_r, __i, 50); \ 158 (int8x16_t)__hi_r, __i, 50); \
157 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 159 __builtin_shufflevector(__r, __r, 3,2,1,0); \
158}) 160})
159#endif /* __LITTLE_ENDIAN__ */ 161#endif /* __LITTLE_ENDIAN__ */
160#endif 162#endif
161 163
162#if defined(__GNUC__) && !defined(__clang__) 164#if defined(__GNUC__) && !defined(__clang__)
163_INTRINSATTR 165_INTRINSATTR
164static __inline uint8x16_t 166static __inline uint8x16_t
165vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) 167vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
166{ 168{
167#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 169#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
168 return __builtin_shuffle(__hi, __lo, 170 return __builtin_shuffle(__hi, __lo,
169 (uint8x16_t) { 171 (uint8x16_t) {
170 16 - __i, 17 - __i, 18 - __i, 19 - __i, 172 16 - __i, 17 - __i, 18 - __i, 19 - __i,
171 20 - __i, 21 - __i, 22 - __i, 23 - __i, 173 20 - __i, 21 - __i, 22 - __i, 23 - __i,
172 24 - __i, 25 - __i, 26 - __i, 27 - __i, 174 24 - __i, 25 - __i, 26 - __i, 27 - __i,
173 28 - __i, 29 - __i, 30 - __i, 31 - __i, 175 28 - __i, 29 - __i, 30 - __i, 31 - __i,
174 }); 176 });
175#else 177#else
176 return __builtin_shuffle(__lo, __hi, 178 return __builtin_shuffle(__lo, __hi,
177 (uint8x16_t) { 179 (uint8x16_t) {
178 __i + 0, __i + 1, __i + 2, __i + 3, 180 __i + 0, __i + 1, __i + 2, __i + 3,
179 __i + 4, __i + 5, __i + 6, __i + 7, 181 __i + 4, __i + 5, __i + 6, __i + 7,
180 __i + 8, __i + 9, __i + 10, __i + 11, 182 __i + 8, __i + 9, __i + 10, __i + 11,
181 __i + 12, __i + 13, __i + 14, __i + 15, 183 __i + 12, __i + 13, __i + 14, __i + 15,
182 }); 184 });
183#endif 185#endif
184} 186}
185#elif defined(__clang__) 187#elif defined(__clang__)
186#ifdef __LITTLE_ENDIAN__ 188#ifdef __LITTLE_ENDIAN__
187#define vextq_u8(__lo, __hi, __i) \ 189#define vextq_u8(__lo, __hi, __i) \
188 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 190 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
189 (int8x16_t)(__hi), (__i), 48) 191 (int8x16_t)(__hi), (__i), 48)
190#else 192#else
191#define vextq_u8(__lo, __hi, __i) ( \ 193#define vextq_u8(__lo, __hi, __i) ( \
192{ \ 194{ \
193 uint8x16_t __tlo = (__lo); \ 195 uint8x16_t __tlo = (__lo); \
194 uint8x16_t __thi = (__hi); \ 196 uint8x16_t __thi = (__hi); \
195 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ 197 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
196 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 198 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
197 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ 199 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
198 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
199 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 201 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
200 (int8x16_t)__hi_r, (__i), 48); \ 202 (int8x16_t)__hi_r, (__i), 48); \
201 return __builtin_shufflevector(__r, __r, \ 203 return __builtin_shufflevector(__r, __r, \
202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 204 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
203}) 205})
204#endif /* __LITTLE_ENDIAN */ 206#endif /* __LITTLE_ENDIAN */
205#endif 207#endif
206 208
207#if defined(__GNUC__) && !defined(__clang__) 209#if defined(__GNUC__) && !defined(__clang__)
208_INTRINSATTR 210_INTRINSATTR
209static __inline uint32_t 211static __inline uint32_t
210vgetq_lane_u32(uint32x4_t __v, uint8_t __i) 212vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
211{ 213{
212#ifdef __aarch64__ 214#ifdef __aarch64__
213 return __v[__i]; 215 return __v[__i];
214#else 216#else
215 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); 217 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
216#endif 218#endif
217} 219}
218#elif defined(__clang__) 220#elif defined(__clang__)
219#define vgetq_lane_u32(__v, __i) \ 221#define vgetq_lane_u32(__v, __i) \
220 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ 222 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
221 __neon_lane_index(__v, __i)) 223 __neon_lane_index(__v, __i))
222#endif 224#endif
223 225
224_INTRINSATTR 226_INTRINSATTR
225static __inline uint32x4_t 227static __inline uint32x4_t
226vld1q_u32(const uint32_t *__p32) 228vld1q_u32(const uint32_t *__p32)
227{ 229{
228#if defined(__GNUC__) && !defined(__clang__) 230#if defined(__GNUC__) && !defined(__clang__)
229#ifdef __aarch64__ 231#ifdef __aarch64__
230 const __builtin_aarch64_simd_si *__p = 232 const __builtin_aarch64_simd_si *__p =
231 (const __builtin_aarch64_simd_si *)__p32; 233 (const __builtin_aarch64_simd_si *)__p32;
232 234
233 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); 235 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
234#else 236#else
235 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; 237 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
236 238
237 return (uint32x4_t)__builtin_neon_vld1v4si(__p); 239 return (uint32x4_t)__builtin_neon_vld1v4si(__p);
238#endif 240#endif
239#elif defined(__clang__) 241#elif defined(__clang__)
240 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); 242 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
241#ifndef __LITTLE_ENDIAN__ 243#ifndef __LITTLE_ENDIAN__
242 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 244 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
243#endif 245#endif
244 return __v; 246 return __v;
245#endif 247#endif
246} 248}
247 249
248_INTRINSATTR 250_INTRINSATTR
249static __inline uint8x16_t 251static __inline uint8x16_t
250vld1q_u8(const uint8_t *__p8) 252vld1q_u8(const uint8_t *__p8)
251{ 253{
252#if defined(__GNUC__) && !defined(__clang__) 254#if defined(__GNUC__) && !defined(__clang__)
253#ifdef __aarch64__ 255#ifdef __aarch64__
254 const __builtin_aarch64_simd_qi *__p = 256 const __builtin_aarch64_simd_qi *__p =
255 (const __builtin_aarch64_simd_qi *)__p8; 257 (const __builtin_aarch64_simd_qi *)__p8;
256 258
257 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); 259 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
258#else 260#else
259 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; 261 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
260 262
261 return (uint8x16_t)__builtin_neon_vld1v16qi(__p); 263 return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
262#endif 264#endif
263#elif defined(__clang__) 265#elif defined(__clang__)
264 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); 266 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
265#ifndef __LITTLE_ENDIAN__ 267#ifndef __LITTLE_ENDIAN__
266 __v = __builtin_shufflevector(__v, __v, 268 __v = __builtin_shufflevector(__v, __v,
267 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 269 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
268#endif 270#endif
269 return __v; 271 return __v;
270#endif 272#endif
271} 273}
272 274
273_INTRINSATTR 275_INTRINSATTR
274static __inline uint8x16_t 276static __inline uint8x16_t
275vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) 277vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
276{ 278{
277#if defined(__GNUC__) && !defined(__clang__) 279#if defined(__GNUC__) && !defined(__clang__)
278#ifdef __aarch64__ 280#ifdef __aarch64__
279 uint8x16_t __res; 281 uint8x16_t __res;
280 __asm__("tbl %0.16b, {%1.16b}, %2.16b" 282 __asm__("tbl %0.16b, {%1.16b}, %2.16b"
281 : "=w"(__res) : "w"(__tab), "w"(__idx)); 283 : "=w"(__res) : "w"(__tab), "w"(__idx));
282 return __res; 284 return __res;
283#else 285#else
284 /* 286 /*
285 * No native ARMv7 NEON instruction for this, so do it via two 287 * No native ARMv7 NEON instruction for this, so do it via two
286 * half-width TBLs instead (vtbl2_u8 equivalent). 288 * half-width TBLs instead (vtbl2_u8 equivalent).
287 */ 289 */
288 uint64x2_t __tab64 = (uint64x2_t)__tab; 290 uint64x2_t __tab64 = (uint64x2_t)__tab;
289 uint8x8_t __tablo = (uint8x8_t)__tab64[0]; 291 uint8x8_t __tablo = (uint8x8_t)__tab64[0];
290 uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; 292 uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
291 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; 293 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
292 union { 294 union {
293 uint8x8x2_t __u8x8x2; 295 uint8x8x2_t __u8x8x2;
294 __builtin_neon_ti __ti; 296 __builtin_neon_ti __ti;
295 } __u = { __tab8x8x2 }; 297 } __u = { __tab8x8x2 };
296 uint64x2_t __idx64, __out64; 298 uint64x2_t __idx64, __out64;
297 int8x8_t __idxlo, __idxhi, __outlo, __outhi; 299 int8x8_t __idxlo, __idxhi, __outlo, __outhi;
298 300
299 __idx64 = (uint64x2_t)__idx; 301 __idx64 = (uint64x2_t)__idx;
300 __idxlo = (int8x8_t)__idx64[0]; 302 __idxlo = (int8x8_t)__idx64[0];
301 __idxhi = (int8x8_t)__idx64[1]; 303 __idxhi = (int8x8_t)__idx64[1];
302 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); 304 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
303 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); 305 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
304 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; 306 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
305 307
306 return (uint8x16_t)__out64; 308 return (uint8x16_t)__out64;
307#endif 309#endif
308#elif defined(__clang__) 310#elif defined(__clang__)
309#ifdef __LITTLE_ENDIAN__ 311#ifdef __LITTLE_ENDIAN__
310 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, 312 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
311 (int8x16_t)__idx, 48); 313 (int8x16_t)__idx, 48);
312#else 314#else
313 uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, 315 uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo,
314 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 316 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
315 uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, 317 uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi,
316 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 318 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
317 uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, 319 uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab,
318 (int8x16_t)__idx, __i, 48); 320 (int8x16_t)__idx, __i, 48);
319 return __builtin_shufflevector(__r, __r, 321 return __builtin_shufflevector(__r, __r,
320 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 322 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
321#endif 323#endif
322#endif 324#endif
323} 325}
324 326
325_INTRINSATTR 327_INTRINSATTR
326static __inline int32x4_t 328static __inline int32x4_t
327vreinterpretq_s32_u8(uint8x16_t __v) 329vreinterpretq_s32_u8(uint8x16_t __v)
328{ 330{
329 return (int32x4_t)__v; 331 return (int32x4_t)__v;
330} 332}
331 333
332_INTRINSATTR 334_INTRINSATTR
333static __inline uint32x4_t 335static __inline uint32x4_t
334vreinterpretq_u32_u8(uint8x16_t __v) 336vreinterpretq_u32_u8(uint8x16_t __v)
335{ 337{
336 return (uint32x4_t)__v; 338 return (uint32x4_t)__v;
337} 339}
338 340
339_INTRINSATTR 341_INTRINSATTR
340static __inline uint64x2_t 342static __inline uint64x2_t
341vreinterpretq_u64_u8(uint8x16_t __v) 343vreinterpretq_u64_u8(uint8x16_t __v)
342{ 344{
343 return (uint64x2_t)__v; 345 return (uint64x2_t)__v;
344} 346}
345 347
346_INTRINSATTR 348_INTRINSATTR
347static __inline uint8x16_t 349static __inline uint8x16_t
348vreinterpretq_u8_s32(int32x4_t __v) 350vreinterpretq_u8_s32(int32x4_t __v)
349{ 351{
350 return (uint8x16_t)__v; 352 return (uint8x16_t)__v;
351} 353}
352 354
353_INTRINSATTR 355_INTRINSATTR
354static __inline uint8x16_t 356static __inline uint8x16_t
355vreinterpretq_u8_u32(uint32x4_t __v) 357vreinterpretq_u8_u32(uint32x4_t __v)
356{ 358{
357 return (uint8x16_t)__v; 359 return (uint8x16_t)__v;
358} 360}
359 361
360_INTRINSATTR 362_INTRINSATTR
361static __inline uint8x16_t 363static __inline uint8x16_t
362vreinterpretq_u8_u64(uint64x2_t __v) 364vreinterpretq_u8_u64(uint64x2_t __v)
363{ 365{
364 return (uint8x16_t)__v; 366 return (uint8x16_t)__v;
365} 367}
366 368
367_INTRINSATTR 369_INTRINSATTR
368static __inline uint8x16_t 370static __inline uint8x16_t
369vrev32q_u8(uint8x16_t __v) 371vrev32q_u8(uint8x16_t __v)
370{ 372{
371#if defined(__GNUC__) && !defined(__clang__) 373#if defined(__GNUC__) && !defined(__clang__)
372 return __builtin_shuffle(__v, 374 return __builtin_shuffle(__v,
373 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); 375 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
374#elif defined(__clang__) 376#elif defined(__clang__)
375 return __builtin_shufflevector(__v, 377 return __builtin_shufflevector(__v,
376 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); 378 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
377#endif 379#endif
378} 380}
379 381
380#if defined(__GNUC__) && !defined(__clang__) 382#if defined(__GNUC__) && !defined(__clang__)
381_INTRINSATTR 383_INTRINSATTR
382static __inline uint32x4_t 384static __inline uint32x4_t
383vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) 385vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
384{ 386{
385 __v[__neon_lane_index(__v, __i)] = __x; 387 __v[__neon_lane_index(__v, __i)] = __x;
386 return __v; 388 return __v;
387} 389}
388#elif defined(__clang__) 390#elif defined(__clang__)
389#define vsetq_lane_u32(__x, __v, __i) \ 391#define vsetq_lane_u32(__x, __v, __i) \
390 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ 392 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
391 __neon_lane_index(__v, __i)) 393 __neon_lane_index(__v, __i))
392#endif 394#endif
393 395
394#if defined(__GNUC__) && !defined(__clang__) 396#if defined(__GNUC__) && !defined(__clang__)
395_INTRINSATTR 397_INTRINSATTR
396static __inline uint64x2_t 398static __inline uint64x2_t
397vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) 399vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
398{ 400{
399 __v[__neon_lane_index(__v, __i)] = __x; 401 __v[__neon_lane_index(__v, __i)] = __x;
400 return __v; 402 return __v;
401} 403}
402#elif defined(__clang__) 404#elif defined(__clang__)
403#define vsetq_lane_u64(__x, __v, __i) \ 405#define vsetq_lane_u64(__x, __v, __i) \
404 (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \ 406 (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \
405 __neon_lane_index(__v, __i)); 407 __neon_lane_index(__v, __i));
406#endif 408#endif
407 409
408#if defined(__GNUC__) && !defined(__clang__) 410#if defined(__GNUC__) && !defined(__clang__)
409_INTRINSATTR 411_INTRINSATTR
410static __inline uint32x4_t 412static __inline uint32x4_t
411vshlq_n_u32(uint32x4_t __v, uint8_t __bits) 413vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
412{ 414{
413#ifdef __aarch64__ 415#ifdef __aarch64__
414 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); 416 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
415#else 417#else
416 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); 418 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
417#endif 419#endif
418} 420}
419#elif defined(__clang__) 421#elif defined(__clang__)
420#define vshlq_n_u32(__v, __bits) \ 422#define vshlq_n_u32(__v, __bits) \
421 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) 423 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
422#endif 424#endif
423 425
424#if defined(__GNUC__) && !defined(__clang__) 426#if defined(__GNUC__) && !defined(__clang__)
425_INTRINSATTR 427_INTRINSATTR
426static __inline uint32x4_t 428static __inline uint32x4_t
427vshrq_n_u32(uint32x4_t __v, uint8_t __bits) 429vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
428{ 430{
429#ifdef __aarch64__ 431#ifdef __aarch64__
430 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); 432 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
431#else 433#else
432 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); 434 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
433#endif 435#endif
434} 436}
435#elif defined(__clang__) 437#elif defined(__clang__)
436#define vshrq_n_u8(__v, __bits) \ 438#define vshrq_n_u8(__v, __bits) \
437 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) 439 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
438#endif 440#endif
439 441
440#if defined(__GNUC__) && !defined(__clang__) 442#if defined(__GNUC__) && !defined(__clang__)
441_INTRINSATTR 443_INTRINSATTR
442static __inline uint8x16_t 444static __inline uint8x16_t
443vshrq_n_u8(uint8x16_t __v, uint8_t __bits) 445vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
444{ 446{
445#ifdef __aarch64__ 447#ifdef __aarch64__
446 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); 448 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
447#else 449#else
448 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); 450 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
449#endif 451#endif
450} 452}
451#elif defined(__clang__) 453#elif defined(__clang__)
452#define vshrq_n_u8(__v, __bits) \ 454#define vshrq_n_u8(__v, __bits) \
453 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) 455 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
454#endif 456#endif
455 457
456#if defined(__GNUC__) && !defined(__clang__) 458#if defined(__GNUC__) && !defined(__clang__)
457_INTRINSATTR 459_INTRINSATTR
458static __inline int32x4_t 460static __inline int32x4_t
459vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) 461vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
460{ 462{
461#ifdef __aarch64__ 463#ifdef __aarch64__
462 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); 464 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
463#else 465#else
464 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); 466 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
465#endif 467#endif
466} 468}
467#elif defined(__clang__) 469#elif defined(__clang__)
468#ifdef __LITTLE_ENDIAN__ 470#ifdef __LITTLE_ENDIAN__
469#define vsliq_n_s32(__vins, __vsh, __bits) \ 471#define vsliq_n_s32(__vins, __vsh, __bits) \
470 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ 472 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
471 (int32x4_t)(__vsh), (__bits), 34) 473 (int32x4_t)(__vsh), (__bits), 34)
472#else 474#else
473#define vsliq_n_s32(__vins, __vsh, __bits) ( \ 475#define vsliq_n_s32(__vins, __vsh, __bits) ( \
474{ \ 476{ \
475 int32x4_t __tvins = (__vins); \ 477 int32x4_t __tvins = (__vins); \
476 int32x4_t __tvsh = (__vsh); \ 478 int32x4_t __tvsh = (__vsh); \
477 uint8_t __tbits = (__bits); \ 479 uint8_t __tbits = (__bits); \
478 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ 480 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
479 3,2,1,0); \ 481 3,2,1,0); \
480 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ 482 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
481 3,2,1,0); \ 483 3,2,1,0); \
482 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ 484 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \
483 34); \ 485 34); \
484 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 486 __builtin_shufflevector(__r, __r, 3,2,1,0); \
485}) 487})
486#endif /* __LITTLE_ENDIAN__ */ 488#endif /* __LITTLE_ENDIAN__ */
487#endif 489#endif
488 490
489_INTRINSATTR 491_INTRINSATTR
490static __inline void 492static __inline void
491vst1q_u32(uint32_t *__p32, uint32x4_t __v) 493vst1q_u32(uint32_t *__p32, uint32x4_t __v)
492{ 494{
493#if defined(__GNUC__) && !defined(__clang__) 495#if defined(__GNUC__) && !defined(__clang__)
494#ifdef __aarch64__ 496#ifdef __aarch64__
495 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; 497 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
496 498
497 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); 499 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
498#else 500#else
499 __builtin_neon_si *__p = (__builtin_neon_si *)__p32; 501 __builtin_neon_si *__p = (__builtin_neon_si *)__p32;
500 502
501 __builtin_neon_vst1v4si(__p, (int32x4_t)__v); 503 __builtin_neon_vst1v4si(__p, (int32x4_t)__v);
502#endif 504#endif
503#elif defined(__clang__) 505#elif defined(__clang__)
504#ifndef __LITTLE_ENDIAN__ 506#ifndef __LITTLE_ENDIAN__
505 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 507 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
506#endif 508#endif
507 __builtin_neon_vst1q_v(__p32, __v, 50); 509 __builtin_neon_vst1q_v(__p32, __v, 50);
508#endif 510#endif
509} 511}
510 512
511_INTRINSATTR 513_INTRINSATTR
512static __inline void 514static __inline void
513vst1q_u8(uint8_t *__p8, uint8x16_t __v) 515vst1q_u8(uint8_t *__p8, uint8x16_t __v)
514{ 516{
515#if defined(__GNUC__) && !defined(__clang__) 517#if defined(__GNUC__) && !defined(__clang__)
516#ifdef __aarch64__ 518#ifdef __aarch64__
517 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; 519 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
518 520
519 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); 521 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
520#else 522#else
521 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; 523 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
522 524
523 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); 525 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
524#endif 526#endif
525#elif defined(__clang__) 527#elif defined(__clang__)
526#ifndef __LITTLE_ENDIAN__ 528#ifndef __LITTLE_ENDIAN__
527 __v = __builtin_shufflevector(__v, __v, 529 __v = __builtin_shufflevector(__v, __v,
528 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 530 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
529#endif 531#endif
530 __builtin_neon_vst1q_v(__p8, __v, 48); 532 __builtin_neon_vst1q_v(__p8, __v, 48);
531#endif 533#endif
532} 534}
533 535
534#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ 536#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */
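
Aside from the new uint8x16x2_t typedefs, this header keeps supplying GCC- and clang-portable stand-ins for the NEON intrinsics, including the vrev32q_u8 behind the CCM byte-order macros.  At the byte level, the vbetoh32q_u8 / vaddq_u32 / vhtobe32q_u8 sequence in the CCM loops increments the trailing 32-bit word of the counter block as a big-endian integer (with no carry into earlier words, since vaddq_u32 is per-lane).  A scalar sketch of just that arithmetic, with a hypothetical ctr32_inc_be() helper; it is an illustration, not the NEON code:

#include <stdint.h>

/*
 * Increment the trailing 32-bit big-endian counter of a 16-byte CCM
 * counter block -- the byte-level effect of rev32 (on little-endian
 * hosts), add {0,0,0,1} lane-wise, rev32 back.  The counter wraps
 * mod 2^32; the carry does not propagate into bytes 0-11.
 */
static void
ctr32_inc_be(uint8_t block[16])
{
        uint32_t c;

        c = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
            ((uint32_t)block[14] << 8) | (uint32_t)block[15];
        c++;
        block[12] = (uint8_t)(c >> 24);
        block[13] = (uint8_t)(c >> 16);
        block[14] = (uint8_t)(c >> 8);
        block[15] = (uint8_t)c;
}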