Tue Jul 28 20:11:09 2020 UTC
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4 CPU, which lacks
ARMv8.0-AES.


(riastradh)
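
The change pairs two independent vpaes computations per pass: the new
aes_neon_enc2 and aes_neon_dec2 take a uint8x16x2_t of blocks through
the round loop together, and the CBC-decrypt, XTS, and CCM paths feed
them two blocks per iteration (or the CBC-MAC block plus the counter
block in CCM).  As a minimal caller-side sketch -- a hypothetical
helper for illustration, not code from this commit, assuming only the
aes_neon_dec2 prototype added via aes_neon_impl.h and the file's
existing use of the GCC vector ^ operator on uint8x16_t --
two-blocks-at-a-time CBC decryption looks roughly like this (the
committed aes_neon_cbc_dec below also peels an odd trailing block and
walks the buffer from the end):

#include <crypto/aes/arch/arm/aes_neon.h>

#include "aes_neon_impl.h"

/*
 * Hypothetical illustration only: decrypt nbytes of CBC ciphertext,
 * nbytes a positive multiple of 32, two blocks per aes_neon_dec2 call.
 */
static void
cbc_dec_2x_sketch(const struct aesdec *dec, const uint8_t *in,
    uint8_t *out, size_t nbytes, uint8_t iv[static 16], uint32_t nrounds)
{
	uint8x16_t cv = vld1q_u8(iv);	/* chaining value, starts at IV */

	for (; nbytes >= 32; nbytes -= 32, in += 32, out += 32) {
		uint8x16x2_t b2;

		b2.val[0] = vld1q_u8(in);	/* C[i] */
		b2.val[1] = vld1q_u8(in + 16);	/* C[i+1] */
		b2 = aes_neon_dec2(dec, b2, nrounds); /* 2 blocks in flight */
		vst1q_u8(out, b2.val[0] ^ cv);			/* P[i] */
		vst1q_u8(out + 16, b2.val[1] ^ vld1q_u8(in));	/* P[i+1] */
		cv = vld1q_u8(in + 16);		/* next chaining value */
	}
	vst1q_u8(iv, cv);		/* return updated IV to caller */
}
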
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c

--- src/sys/crypto/aes/arch/arm/aes_neon.c 2020/06/30 20:32:11 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c 2020/07/28 20:11:09 1.4
@@ -1,631 +1,740 @@ @@ -1,631 +1,740 @@
1/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ 1/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29/* 29/*
30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES 30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
31 * software, at <https://crypto.stanford.edu/vpaes/>, described in 31 * software, at <https://crypto.stanford.edu/vpaes/>, described in
32 * 32 *
33 * Mike Hamburg, `Accelerating AES with Vector Permute 33 * Mike Hamburg, `Accelerating AES with Vector Permute
34 * Instructions', in Christophe Clavier and Kris Gaj (eds.), 34 * Instructions', in Christophe Clavier and Kris Gaj (eds.),
35 * Cryptographic Hardware and Embedded Systems -- CHES 2009, 35 * Cryptographic Hardware and Embedded Systems -- CHES 2009,
36 * Springer LNCS 5747, pp. 18-32. 36 * Springer LNCS 5747, pp. 18-32.
37 * 37 *
38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39 */ 39 */
40 40
41#include <sys/cdefs.h> 41#include <sys/cdefs.h>
42__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); 42__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
43 43
44#include <sys/types.h> 44#include <sys/types.h>
45 45
46#ifdef _KERNEL 46#ifdef _KERNEL
47#include <sys/systm.h> 47#include <sys/systm.h>
48#else 48#else
49#include <err.h> 49#include <err.h>
50#define panic(fmt, args...) err(1, fmt, ##args) 50#define panic(fmt, args...) err(1, fmt, ##args)
51#endif 51#endif
52 52
53#include "aes_neon_impl.h" 53#include "aes_neon_impl.h"
54 54
55#ifdef __aarch64__ 55#ifdef __aarch64__
56#define __aarch64_used 56#define __aarch64_used
57#else 57#else
58#define __aarch64_used __unused 58#define __aarch64_used __unused
59#endif 59#endif
60 60
61static const uint8x16_t 61static const uint8x16_t
62mc_forward[4] = { 62mc_forward[4] = {
63 {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, 63 {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
64 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C}, 64 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
65 {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, 65 {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
66 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00}, 66 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
67 {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, 67 {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
68 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04}, 68 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
69 {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, 69 {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
70 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, 70 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
71}, 71},
72mc_backward[4] __aarch64_used = { 72mc_backward[4] __aarch64_used = {
73 {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, 73 {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
74 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, 74 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
75 {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, 75 {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
76 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A}, 76 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
77 {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, 77 {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
78 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06}, 78 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
79 {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, 79 {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
80 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, 80 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
81}, 81},
82ipt[2] __aarch64_used = { 82ipt[2] __aarch64_used = {
83 {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, 83 {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
84 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, 84 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
85 {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, 85 {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
86 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD}, 86 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
87}, 87},
88opt[2] = { 88opt[2] = {
89 {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, 89 {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
90 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7}, 90 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
91 {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, 91 {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
92 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, 92 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
93}, 93},
94dipt[2] __aarch64_used = { 94dipt[2] __aarch64_used = {
95 {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, 95 {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
96 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, 96 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
97 {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, 97 {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
98 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, 98 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
99}, 99},
100sb1[2] __aarch64_used = { 100sb1[2] __aarch64_used = {
101 {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, 101 {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
102 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, 102 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
103 {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, 103 {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
104 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, 104 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
105}, 105},
106sb2[2] __aarch64_used = { 106sb2[2] __aarch64_used = {
107 {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, 107 {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
108 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, 108 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
109 {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, 109 {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
110 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, 110 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
111}, 111},
112sbo[2] __aarch64_used = { 112sbo[2] __aarch64_used = {
113 {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, 113 {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
114 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, 114 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
115 {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, 115 {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
116 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, 116 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
117}, 117},
118dsb9[2] __aarch64_used = { 118dsb9[2] __aarch64_used = {
119 {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, 119 {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
120 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, 120 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
121 {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, 121 {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
122 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, 122 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
123}, 123},
124dsbd[2] __aarch64_used = { 124dsbd[2] __aarch64_used = {
125 {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, 125 {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
126 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, 126 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
127 {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, 127 {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
128 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, 128 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
129}, 129},
130dsbb[2] __aarch64_used = { 130dsbb[2] __aarch64_used = {
131 {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, 131 {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
132 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, 132 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
133 {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, 133 {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
134 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, 134 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
135}, 135},
136dsbe[2] __aarch64_used = { 136dsbe[2] __aarch64_used = {
137 {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, 137 {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
138 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, 138 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
139 {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, 139 {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
140 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, 140 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
141}, 141},
142dsbo[2] __aarch64_used = { 142dsbo[2] __aarch64_used = {
143 {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, 143 {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
144 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, 144 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
145 {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, 145 {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
146 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA}, 146 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
147}, 147},
148dks1[2] = { 148dks1[2] = {
149 {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, 149 {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
150 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A}, 150 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
151 {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, 151 {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
152 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B}, 152 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
153}, 153},
154dks2[2] = { 154dks2[2] = {
155 {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, 155 {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
156 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46}, 156 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
157 {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, 157 {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
158 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73}, 158 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
159}, 159},
160dks3[2] = { 160dks3[2] = {
161 {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, 161 {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
162 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8}, 162 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
163 {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, 163 {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
164 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5}, 164 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
165}, 165},
166dks4[2] = { 166dks4[2] = {
167 {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, 167 {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
168 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0}, 168 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
169 {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, 169 {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
170 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F}, 170 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F},
171}, 171},
172deskew[2] = { 172deskew[2] = {
173 {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, 173 {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
174 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D}, 174 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D},
175 {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, 175 {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
176 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, 176 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
177}, 177},
178sr[4] __aarch64_used = { 178sr[4] __aarch64_used = {
179 {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 179 {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
180 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, 180 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
181 {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, 181 {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
182 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B}, 182 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B},
183 {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, 183 {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
184 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07}, 184 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07},
185 {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, 185 {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
186 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03}, 186 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03},
187}, 187},
188rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, 188rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
189 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70}, 189 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70},
190s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, 190s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
191 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B}, 191 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B},
192of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, 192of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
193 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F}, 193 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
194inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, 194inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
195 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04}, 195 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04},
196inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, 196inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
197 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03}; 197 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03};
198 198
199static inline uint8x16_t 199static inline uint8x16_t
200loadroundkey(const void *rkp) 200loadroundkey(const void *rkp)
201{ 201{
202 return vld1q_u8(rkp); 202 return vld1q_u8(rkp);
203} 203}
204 204
205static inline void 205static inline void
206storeroundkey(void *rkp, uint8x16_t rk) 206storeroundkey(void *rkp, uint8x16_t rk)
207{ 207{
208 vst1q_u8(rkp, rk); 208 vst1q_u8(rkp, rk);
209} 209}
210 210
211/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */ 211/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */
212static inline void 212static inline void
213bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x) 213bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
214{ 214{
215 215
216 *lo = of & x; 216 *lo = of & x;
217 *hi = of & vshrq_n_u8(x, 4); 217 *hi = of & vshrq_n_u8(x, 4);
218} 218}
219 219
220/* 220/*
221 * t is a pair of maps respectively from low and high nybbles to bytes. 221 * t is a pair of maps respectively from low and high nybbles to bytes.
222 * Apply t the nybbles, and add the results in GF(2). 222 * Apply t the nybbles, and add the results in GF(2).
223 */ 223 */
224static uint8x16_t 224static uint8x16_t
225aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2]) 225aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
226{ 226{
227 uint8x16_t lo, hi; 227 uint8x16_t lo, hi;
228 228
229 bytes2nybbles(&lo, &hi, x); 229 bytes2nybbles(&lo, &hi, x);
230 return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi); 230 return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi);
231} 231}
232 232
233static inline void 233static inline void
234subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_, 234subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
235 uint8x16_t inva_) 235 uint8x16_t inva_)
236{ 236{
237 uint8x16_t k, i, ak, j; 237 uint8x16_t k, i, ak, j;
238 238
239 bytes2nybbles(&k, &i, x); 239 bytes2nybbles(&k, &i, x);
240 ak = vqtbl1q_u8(inva_, k); 240 ak = vqtbl1q_u8(inva_, k);
241 j = i ^ k; 241 j = i ^ k;
242 *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i)); 242 *io = j ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, i));
243 *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j)); 243 *jo = i ^ vqtbl1q_u8(inv_, ak ^ vqtbl1q_u8(inv_, j));
244} 244}
245 245
246static uint8x16_t 246static uint8x16_t
247aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk) 247aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
248{ 248{
249 uint8x16_t io, jo; 249 uint8x16_t io, jo;
250 250
251 /* smear prk */ 251 /* smear prk */
252 prk ^= vextq_u8(vdupq_n_u8(0), prk, 12); 252 prk ^= vextq_u8(vdupq_n_u8(0), prk, 12);
253 prk ^= vextq_u8(vdupq_n_u8(0), prk, 8); 253 prk ^= vextq_u8(vdupq_n_u8(0), prk, 8);
254 prk ^= s63; 254 prk ^= s63;
255 255
256 /* subbytes */ 256 /* subbytes */
257 subbytes(&io, &jo, rk, inv, inva); 257 subbytes(&io, &jo, rk, inv, inva);
258 rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo); 258 rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);
259 259
260 /* add in smeared stuff */ 260 /* add in smeared stuff */
261 return rk ^ prk; 261 return rk ^ prk;
262} 262}
263 263
264static uint8x16_t 264static uint8x16_t
265aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot) 265aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
266{ 266{
267 uint32x4_t rk32; 267 uint32x4_t rk32;
268 268
269 /* extract rcon from rcon_rot */ 269 /* extract rcon from rcon_rot */
270 prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15); 270 prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15);
271 *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15); 271 *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15);
272 272
273 /* rotate */ 273 /* rotate */
274 rk32 = vreinterpretq_u32_u8(rk); 274 rk32 = vreinterpretq_u32_u8(rk);
275 rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3)); 275 rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3));
276 rk = vreinterpretq_u8_u32(rk32); 276 rk = vreinterpretq_u8_u32(rk32);
277 rk = vextq_u8(rk, rk, 1); 277 rk = vextq_u8(rk, rk, 1);
278 278
279 return aes_schedule_low_round(rk, prk); 279 return aes_schedule_low_round(rk, prk);
280} 280}
281 281
282static uint8x16_t 282static uint8x16_t
283aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i) 283aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
284{ 284{
285 uint8x16_t y = vdupq_n_u8(0); 285 uint8x16_t y = vdupq_n_u8(0);
286 286
287 x ^= s63; 287 x ^= s63;
288 288
289 x = vqtbl1q_u8(x, mc_forward[0]); 289 x = vqtbl1q_u8(x, mc_forward[0]);
290 y ^= x; 290 y ^= x;
291 x = vqtbl1q_u8(x, mc_forward[0]); 291 x = vqtbl1q_u8(x, mc_forward[0]);
292 y ^= x; 292 y ^= x;
293 x = vqtbl1q_u8(x, mc_forward[0]); 293 x = vqtbl1q_u8(x, mc_forward[0]);
294 y ^= x; 294 y ^= x;
295 295
296 return vqtbl1q_u8(y, sr_i); 296 return vqtbl1q_u8(y, sr_i);
297} 297}
298 298
299static uint8x16_t 299static uint8x16_t
300aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i) 300aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
301{ 301{
302 302
303 return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt); 303 return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt);
304} 304}
305 305
306static uint8x16_t 306static uint8x16_t
307aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i) 307aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
308{ 308{
309 uint8x16_t y = vdupq_n_u8(0); 309 uint8x16_t y = vdupq_n_u8(0);
310 310
311 x = aes_schedule_transform(x, dks1); 311 x = aes_schedule_transform(x, dks1);
312 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 312 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
313 x = aes_schedule_transform(x, dks2); 313 x = aes_schedule_transform(x, dks2);
314 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 314 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
315 x = aes_schedule_transform(x, dks3); 315 x = aes_schedule_transform(x, dks3);
316 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 316 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
317 x = aes_schedule_transform(x, dks4); 317 x = aes_schedule_transform(x, dks4);
318 y = vqtbl1q_u8(y ^ x, mc_forward[0]); 318 y = vqtbl1q_u8(y ^ x, mc_forward[0]);
319 319
320 return vqtbl1q_u8(y, sr_i); 320 return vqtbl1q_u8(y, sr_i);
321} 321}
322 322
323static uint8x16_t 323static uint8x16_t
324aes_schedule_mangle_last_dec(uint8x16_t x) 324aes_schedule_mangle_last_dec(uint8x16_t x)
325{ 325{
326 326
327 return aes_schedule_transform(x ^ s63, deskew); 327 return aes_schedule_transform(x ^ s63, deskew);
328} 328}
329 329
330static uint8x16_t 330static uint8x16_t
331aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk) 331aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
332{ 332{
333 uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi); 333 uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi);
334 uint32x4_t prk32 = vreinterpretq_u32_u8(prk); 334 uint32x4_t prk32 = vreinterpretq_u32_u8(prk);
335 uint32x4_t rk32; 335 uint32x4_t rk32;
336 336
337 rk32 = prkhi32; 337 rk32 = prkhi32;
338 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2), 338 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2),
339 vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)), 339 vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)),
340 3); 340 3);
341 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2), 341 rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2),
342 vdupq_n_u32(vgetq_lane_u32(prk32, 3)), 342 vdupq_n_u32(vgetq_lane_u32(prk32, 3)),
343 0); 343 0);
344 344
345 return vreinterpretq_u8_u32(rk32); 345 return vreinterpretq_u8_u32(rk32);
346} 346}
347 347
348static uint8x16_t 348static uint8x16_t
349aes_schedule_192_smearhi(uint8x16_t rk) 349aes_schedule_192_smearhi(uint8x16_t rk)
350{ 350{
351 uint64x2_t rk64 = vreinterpretq_u64_u8(rk); 351 uint64x2_t rk64 = vreinterpretq_u64_u8(rk);
352 352
353 rk64 = vsetq_lane_u64(0, rk64, 0); 353 rk64 = vsetq_lane_u64(0, rk64, 0);
354 354
355 return vreinterpretq_u8_u64(rk64); 355 return vreinterpretq_u8_u64(rk64);
356} 356}
357 357
358void 358void
359aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds) 359aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
360{ 360{
361 uint32_t *rk32 = enc->aese_aes.aes_rk; 361 uint32_t *rk32 = enc->aese_aes.aes_rk;
362 uint8x16_t mrk; /* mangled round key */ 362 uint8x16_t mrk; /* mangled round key */
363 uint8x16_t rk; /* round key */ 363 uint8x16_t rk; /* round key */
364 uint8x16_t prk; /* previous round key */ 364 uint8x16_t prk; /* previous round key */
365 uint8x16_t rcon_rot = rcon; 365 uint8x16_t rcon_rot = rcon;
366 uint64_t i = 3; 366 uint64_t i = 3;
367 367
368 /* input transform */ 368 /* input transform */
369 rk = aes_schedule_transform(vld1q_u8(key), ipt); 369 rk = aes_schedule_transform(vld1q_u8(key), ipt);
370 storeroundkey(rk32, rk); 370 storeroundkey(rk32, rk);
371 rk32 += 4; 371 rk32 += 4;
372 372
373 switch (nrounds) { 373 switch (nrounds) {
374 case 10: 374 case 10:
375 for (;;) { 375 for (;;) {
376 rk = aes_schedule_round(rk, rk, &rcon_rot); 376 rk = aes_schedule_round(rk, rk, &rcon_rot);
377 if (--nrounds == 0) 377 if (--nrounds == 0)
378 break; 378 break;
379 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 379 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
380 storeroundkey(rk32, mrk); 380 storeroundkey(rk32, mrk);
381 rk32 += 4; 381 rk32 += 4;
382 } 382 }
383 break; 383 break;
384 case 12: { 384 case 12: {
385 uint8x16_t prkhi; /* high half of previous round key */ 385 uint8x16_t prkhi; /* high half of previous round key */
386 386
387 prk = rk; 387 prk = rk;
388 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); 388 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
389 prkhi = aes_schedule_192_smearhi(rk); 389 prkhi = aes_schedule_192_smearhi(rk);
390 for (;;) { 390 for (;;) {
391 prk = aes_schedule_round(rk, prk, &rcon_rot); 391 prk = aes_schedule_round(rk, prk, &rcon_rot);
392 rk = vextq_u8(prkhi, prk, 8); 392 rk = vextq_u8(prkhi, prk, 8);
393 393
394 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 394 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
395 storeroundkey(rk32, mrk); 395 storeroundkey(rk32, mrk);
396 rk32 += 4; 396 rk32 += 4;
397 rk = aes_schedule_192_smear(prkhi, prk); 397 rk = aes_schedule_192_smear(prkhi, prk);
398 prkhi = aes_schedule_192_smearhi(rk); 398 prkhi = aes_schedule_192_smearhi(rk);
399 399
400 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 400 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
401 storeroundkey(rk32, mrk); 401 storeroundkey(rk32, mrk);
402 rk32 += 4; 402 rk32 += 4;
403 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 403 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
404 if ((nrounds -= 3) == 0) 404 if ((nrounds -= 3) == 0)
405 break; 405 break;
406 406
407 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 407 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
408 storeroundkey(rk32, mrk); 408 storeroundkey(rk32, mrk);
409 rk32 += 4; 409 rk32 += 4;
410 rk = aes_schedule_192_smear(prkhi, prk); 410 rk = aes_schedule_192_smear(prkhi, prk);
411 prkhi = aes_schedule_192_smearhi(rk); 411 prkhi = aes_schedule_192_smearhi(rk);
412 } 412 }
413 break; 413 break;
414 } 414 }
415 case 14: { 415 case 14: {
416 uint8x16_t pprk; /* previous previous round key */ 416 uint8x16_t pprk; /* previous previous round key */
417 417
418 prk = rk; 418 prk = rk;
419 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); 419 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
420 for (;;) { 420 for (;;) {
421 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 421 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
422 storeroundkey(rk32, mrk); 422 storeroundkey(rk32, mrk);
423 rk32 += 4; 423 rk32 += 4;
424 pprk = rk; 424 pprk = rk;
425 425
426 /* high round */ 426 /* high round */
427 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 427 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
428 if ((nrounds -= 2) == 0) 428 if ((nrounds -= 2) == 0)
429 break; 429 break;
430 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); 430 mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
431 storeroundkey(rk32, mrk); 431 storeroundkey(rk32, mrk);
432 rk32 += 4; 432 rk32 += 4;
433 433
434 /* low round */ 434 /* low round */
435 rk = vreinterpretq_u8_u32( 435 rk = vreinterpretq_u8_u32(
436 vdupq_n_u32( 436 vdupq_n_u32(
437 vgetq_lane_u32(vreinterpretq_u32_u8(rk), 437 vgetq_lane_u32(vreinterpretq_u32_u8(rk),
438 3))); 438 3)));
439 rk = aes_schedule_low_round(rk, pprk); 439 rk = aes_schedule_low_round(rk, pprk);
440 } 440 }
441 break; 441 break;
442 } 442 }
443 default: 443 default:
444 panic("invalid number of AES rounds: %u", nrounds); 444 panic("invalid number of AES rounds: %u", nrounds);
445 } 445 }
446 storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4])); 446 storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
447} 447}
448 448
449void 449void
450aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds) 450aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
451{ 451{
452 uint32_t *rk32 = dec->aesd_aes.aes_rk; 452 uint32_t *rk32 = dec->aesd_aes.aes_rk;
453 uint8x16_t mrk; /* mangled round key */ 453 uint8x16_t mrk; /* mangled round key */
454 uint8x16_t ork; /* original round key */ 454 uint8x16_t ork; /* original round key */
455 uint8x16_t rk; /* round key */ 455 uint8x16_t rk; /* round key */
456 uint8x16_t prk; /* previous round key */ 456 uint8x16_t prk; /* previous round key */
457 uint8x16_t rcon_rot = rcon; 457 uint8x16_t rcon_rot = rcon;
458 unsigned i = nrounds == 12 ? 0 : 2; 458 unsigned i = nrounds == 12 ? 0 : 2;
459 459
460 ork = vld1q_u8(key); 460 ork = vld1q_u8(key);
461 461
462 /* input transform */ 462 /* input transform */
463 rk = aes_schedule_transform(ork, ipt); 463 rk = aes_schedule_transform(ork, ipt);
464 464
465 /* go from end */ 465 /* go from end */
466 rk32 += 4*nrounds; 466 rk32 += 4*nrounds;
467 storeroundkey(rk32, vqtbl1q_u8(ork, sr[i])); 467 storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
468 rk32 -= 4; 468 rk32 -= 4;
469 i ^= 3; 469 i ^= 3;
470 470
471 switch (nrounds) { 471 switch (nrounds) {
472 case 10: 472 case 10:
473 for (;;) { 473 for (;;) {
474 rk = aes_schedule_round(rk, rk, &rcon_rot); 474 rk = aes_schedule_round(rk, rk, &rcon_rot);
475 if (--nrounds == 0) 475 if (--nrounds == 0)
476 break; 476 break;
477 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 477 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
478 storeroundkey(rk32, mrk); 478 storeroundkey(rk32, mrk);
479 rk32 -= 4; 479 rk32 -= 4;
480 } 480 }
481 break; 481 break;
482 case 12: { 482 case 12: {
483 uint8x16_t prkhi; /* high half of previous round key */ 483 uint8x16_t prkhi; /* high half of previous round key */
484 484
485 prk = rk; 485 prk = rk;
486 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); 486 rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
487 prkhi = aes_schedule_192_smearhi(rk); 487 prkhi = aes_schedule_192_smearhi(rk);
488 for (;;) { 488 for (;;) {
489 prk = aes_schedule_round(rk, prk, &rcon_rot); 489 prk = aes_schedule_round(rk, prk, &rcon_rot);
490 rk = vextq_u8(prkhi, prk, 8); 490 rk = vextq_u8(prkhi, prk, 8);
491 491
492 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 492 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
493 storeroundkey(rk32, mrk); 493 storeroundkey(rk32, mrk);
494 rk32 -= 4; 494 rk32 -= 4;
495 rk = aes_schedule_192_smear(prkhi, prk); 495 rk = aes_schedule_192_smear(prkhi, prk);
496 prkhi = aes_schedule_192_smearhi(rk); 496 prkhi = aes_schedule_192_smearhi(rk);
497 497
498 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 498 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
499 storeroundkey(rk32, mrk); 499 storeroundkey(rk32, mrk);
500 rk32 -= 4; 500 rk32 -= 4;
501 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 501 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
502 if ((nrounds -= 3) == 0) 502 if ((nrounds -= 3) == 0)
503 break; 503 break;
504 504
505 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 505 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
506 storeroundkey(rk32, mrk); 506 storeroundkey(rk32, mrk);
507 rk32 -= 4; 507 rk32 -= 4;
508 rk = aes_schedule_192_smear(prkhi, prk); 508 rk = aes_schedule_192_smear(prkhi, prk);
509 prkhi = aes_schedule_192_smearhi(rk); 509 prkhi = aes_schedule_192_smearhi(rk);
510 } 510 }
511 break; 511 break;
512 } 512 }
513 case 14: { 513 case 14: {
514 uint8x16_t pprk; /* previous previous round key */ 514 uint8x16_t pprk; /* previous previous round key */
515 515
516 prk = rk; 516 prk = rk;
517 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); 517 rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
518 for (;;) { 518 for (;;) {
519 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 519 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
520 storeroundkey(rk32, mrk); 520 storeroundkey(rk32, mrk);
521 rk32 -= 4; 521 rk32 -= 4;
522 pprk = rk; 522 pprk = rk;
523 523
524 /* high round */ 524 /* high round */
525 rk = prk = aes_schedule_round(rk, prk, &rcon_rot); 525 rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
526 if ((nrounds -= 2) == 0) 526 if ((nrounds -= 2) == 0)
527 break; 527 break;
528 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); 528 mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
529 storeroundkey(rk32, mrk); 529 storeroundkey(rk32, mrk);
530 rk32 -= 4; 530 rk32 -= 4;
531 531
532 /* low round */ 532 /* low round */
533 rk = vreinterpretq_u8_u32( 533 rk = vreinterpretq_u8_u32(
534 vdupq_n_u32( 534 vdupq_n_u32(
535 vgetq_lane_u32(vreinterpretq_u32_u8(rk), 535 vgetq_lane_u32(vreinterpretq_u32_u8(rk),
536 3))); 536 3)));
537 rk = aes_schedule_low_round(rk, pprk); 537 rk = aes_schedule_low_round(rk, pprk);
538 } 538 }
539 break; 539 break;
540 } 540 }
541 default: 541 default:
542 panic("invalid number of AES rounds: %u", nrounds); 542 panic("invalid number of AES rounds: %u", nrounds);
543 } 543 }
544 storeroundkey(rk32, aes_schedule_mangle_last_dec(rk)); 544 storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
545} 545}
546 546
547#ifdef __aarch64__ 547#ifdef __aarch64__
548 548
549/* 549/*
550 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we 550 * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
551 * do the performance-critical parts -- encryption and decryption -- in 551 * do the performance-critical parts -- encryption and decryption -- in
552 * hand-written assembly on arm32. 552 * hand-written assembly on arm32.
553 */ 553 */
554 554
555uint8x16_t 555uint8x16_t
556aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds) 556aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
557{ 557{
558 const uint32_t *rk32 = enc->aese_aes.aes_rk; 558 const uint32_t *rk32 = enc->aese_aes.aes_rk;
559 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 559 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
560 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 560 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
561 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0]; 561 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
562 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1]; 562 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
563 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0]; 563 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
564 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1]; 564 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
565 uint8x16_t io, jo; 565 uint8x16_t io, jo;
566 unsigned rmod4 = 0; 566 unsigned rmod4 = 0;
567 567
568 x = aes_schedule_transform(x, ipt); 568 x = aes_schedule_transform(x, ipt);
569 x ^= loadroundkey(rk32); 569 x ^= loadroundkey(rk32);
570 for (;;) { 570 for (;;) {
571 uint8x16_t A, A2, A2_B, A2_B_D; 571 uint8x16_t A, A2, A2_B, A2_B_D;
572 572
573 subbytes(&io, &jo, x, inv_, inva_); 573 subbytes(&io, &jo, x, inv_, inva_);
574 574
575 rk32 += 4; 575 rk32 += 4;
576 rmod4 = (rmod4 + 1) % 4; 576 rmod4 = (rmod4 + 1) % 4;
577 if (--nrounds == 0) 577 if (--nrounds == 0)
578 break; 578 break;
579 579
580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); 580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
581 A ^= loadroundkey(rk32); 581 A ^= loadroundkey(rk32);
582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); 582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); 583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); 584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); 585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
586 } 586 }
587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); 587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
588 x ^= loadroundkey(rk32); 588 x ^= loadroundkey(rk32);
589 return vqtbl1q_u8(x, sr[rmod4]); 589 return vqtbl1q_u8(x, sr[rmod4]);
590} 590}
591 591
 592uint8x16x2_t
 593aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
 594{
 595 const uint32_t *rk32 = enc->aese_aes.aes_rk;
 596 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 597 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 598 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
 599 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
 600 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
 601 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
 602 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 603 uint8x16_t io0, jo0, io1, jo1;
 604 unsigned rmod4 = 0;
 605
 606 x0 = aes_schedule_transform(x0, ipt);
 607 x1 = aes_schedule_transform(x1, ipt);
 608 x0 ^= loadroundkey(rk32);
 609 x1 ^= loadroundkey(rk32);
 610 for (;;) {
 611 uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
 612 uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
 613
 614 subbytes(&io0, &jo0, x0, inv_, inva_);
 615 subbytes(&io1, &jo1, x1, inv_, inva_);
 616
 617 rk32 += 4;
 618 rmod4 = (rmod4 + 1) % 4;
 619 if (--nrounds == 0)
 620 break;
 621
 622 A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
 623 A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
 624 A_0 ^= loadroundkey(rk32);
 625 A_1 ^= loadroundkey(rk32);
 626 A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
 627 A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
 628 A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
 629 A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
 630 A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
 631 A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
 632 x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
 633 x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
 634 }
 635 x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
 636 x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
 637 x0 ^= loadroundkey(rk32);
 638 x1 ^= loadroundkey(rk32);
 639 return (uint8x16x2_t) { .val = {
 640 [0] = vqtbl1q_u8(x0, sr[rmod4]),
 641 [1] = vqtbl1q_u8(x1, sr[rmod4]),
 642 } };
 643}
 644
592uint8x16_t 645uint8x16_t
593aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) 646aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
594{ 647{
595 const uint32_t *rk32 = dec->aesd_aes.aes_rk; 648 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
596 unsigned i = 3 & ~(nrounds - 1); 649 unsigned i = 3 & ~(nrounds - 1);
597 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 650 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
598 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 651 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
599 uint8x16_t io, jo, mc; 652 uint8x16_t io, jo, mc;
600 653
601 x = aes_schedule_transform(x, dipt); 654 x = aes_schedule_transform(x, dipt);
602 x ^= loadroundkey(rk32); 655 x ^= loadroundkey(rk32);
603 rk32 += 4; 656 rk32 += 4;
604 657
605 mc = mc_forward[3]; 658 mc = mc_forward[3];
606 for (;;) { 659 for (;;) {
607 subbytes(&io, &jo, x, inv_, inva_); 660 subbytes(&io, &jo, x, inv_, inva_);
608 if (--nrounds == 0) 661 if (--nrounds == 0)
609 break; 662 break;
610 663
611 x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo); 664 x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
612 x ^= loadroundkey(rk32); 665 x ^= loadroundkey(rk32);
613 rk32 += 4; /* next round key */ 666 rk32 += 4; /* next round key */
614 667
615 x = vqtbl1q_u8(x, mc); 668 x = vqtbl1q_u8(x, mc);
616 x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo); 669 x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);
617 670
618 x = vqtbl1q_u8(x, mc); 671 x = vqtbl1q_u8(x, mc);
619 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); 672 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);
620 673
621 x = vqtbl1q_u8(x, mc); 674 x = vqtbl1q_u8(x, mc);
622 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); 675 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);
623 676
624 mc = vextq_u8(mc, mc, 12); 677 mc = vextq_u8(mc, mc, 12);
625 } 678 }
626 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); 679 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
627 x ^= loadroundkey(rk32); 680 x ^= loadroundkey(rk32);
628 return vqtbl1q_u8(x, sr[i]); 681 return vqtbl1q_u8(x, sr[i]);
629} 682}
630 683
 684uint8x16x2_t
 685aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
 686{
 687 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
 688 unsigned i = 3 & ~(nrounds - 1);
 689 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 690 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 691 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 692 uint8x16_t io0, jo0, io1, jo1, mc;
 693
 694 x0 = aes_schedule_transform(x0, dipt);
 695 x1 = aes_schedule_transform(x1, dipt);
 696 x0 ^= loadroundkey(rk32);
 697 x1 ^= loadroundkey(rk32);
 698 rk32 += 4;
 699
 700 mc = mc_forward[3];
 701 for (;;) {
 702 subbytes(&io0, &jo0, x0, inv_, inva_);
 703 subbytes(&io1, &jo1, x1, inv_, inva_);
 704 if (--nrounds == 0)
 705 break;
 706
 707 x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
 708 x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
 709 x0 ^= loadroundkey(rk32);
 710 x1 ^= loadroundkey(rk32);
 711 rk32 += 4; /* next round key */
 712
 713 x0 = vqtbl1q_u8(x0, mc);
 714 x1 = vqtbl1q_u8(x1, mc);
 715 x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
 716 x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
 717
 718 x0 = vqtbl1q_u8(x0, mc);
 719 x1 = vqtbl1q_u8(x1, mc);
 720 x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
 721 x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
 722
 723 x0 = vqtbl1q_u8(x0, mc);
 724 x1 = vqtbl1q_u8(x1, mc);
 725 x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
 726 x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
 727
 728 mc = vextq_u8(mc, mc, 12);
 729 }
 730 x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
 731 x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
 732 x0 ^= loadroundkey(rk32);
 733 x1 ^= loadroundkey(rk32);
 734 return (uint8x16x2_t) { .val = {
 735 [0] = vqtbl1q_u8(x0, sr[i]),
 736 [1] = vqtbl1q_u8(x1, sr[i]),
 737 } };
 738}
 739
631#endif 740#endif

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,309 +1,382 @@ @@ -1,309 +1,382 @@
1/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ 1/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/cdefs.h> 29#include <sys/cdefs.h>
30__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); 30__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
31 31
32#include <sys/endian.h> 32#include <sys/endian.h>
33 33
34#ifdef _KERNEL 34#ifdef _KERNEL
35#include <sys/systm.h> 35#include <sys/systm.h>
36#include <lib/libkern/libkern.h> 36#include <lib/libkern/libkern.h>
37#else 37#else
38#include <assert.h> 38#include <assert.h>
39#include <inttypes.h> 39#include <inttypes.h>
40#include <stdio.h> 40#include <stdio.h>
41#define KASSERT assert 41#define KASSERT assert
42#endif 42#endif
43 43
44#include <crypto/aes/arch/arm/aes_neon.h> 44#include <crypto/aes/arch/arm/aes_neon.h>
45 45
46#include "aes_neon_impl.h" 46#include "aes_neon_impl.h"
47 47
48static inline uint8x16_t 48static inline uint8x16_t
49loadblock(const void *in) 49loadblock(const void *in)
50{ 50{
51 return vld1q_u8(in); 51 return vld1q_u8(in);
52} 52}
53 53
54static inline void 54static inline void
55storeblock(void *out, uint8x16_t block) 55storeblock(void *out, uint8x16_t block)
56{ 56{
57 vst1q_u8(out, block); 57 vst1q_u8(out, block);
58} 58}
59 59
60void 60void
61aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], 61aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16],
62 uint8_t out[static 16], uint32_t nrounds) 62 uint8_t out[static 16], uint32_t nrounds)
63{ 63{
64 uint8x16_t block; 64 uint8x16_t block;
65 65
66 block = loadblock(in); 66 block = loadblock(in);
67 block = aes_neon_enc1(enc, block, nrounds); 67 block = aes_neon_enc1(enc, block, nrounds);
68 storeblock(out, block); 68 storeblock(out, block);
69} 69}
70 70
71void 71void
72aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], 72aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16],
73 uint8_t out[static 16], uint32_t nrounds) 73 uint8_t out[static 16], uint32_t nrounds)
74{ 74{
75 uint8x16_t block; 75 uint8x16_t block;
76 76
77 block = loadblock(in); 77 block = loadblock(in);
78 block = aes_neon_dec1(dec, block, nrounds); 78 block = aes_neon_dec1(dec, block, nrounds);
79 storeblock(out, block); 79 storeblock(out, block);
80} 80}
81 81
82void 82void
83aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], 83aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
84 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 84 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
85 uint32_t nrounds) 85 uint32_t nrounds)
86{ 86{
87 uint8x16_t cv; 87 uint8x16_t cv;
88 88
89 KASSERT(nbytes); 89 KASSERT(nbytes);
90 90
91 cv = loadblock(iv); 91 cv = loadblock(iv);
92 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 92 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
93 cv ^= loadblock(in); 93 cv ^= loadblock(in);
94 cv = aes_neon_enc1(enc, cv, nrounds); 94 cv = aes_neon_enc1(enc, cv, nrounds);
95 storeblock(out, cv); 95 storeblock(out, cv);
96 } 96 }
97 storeblock(iv, cv); 97 storeblock(iv, cv);
98} 98}
99 99
100void 100void
101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], 101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
103 uint32_t nrounds) 103 uint32_t nrounds)
104{ 104{
105 uint8x16_t iv0, cv, b; 105 uint8x16_t iv0, cv, b;
106 106
107 KASSERT(nbytes); 107 KASSERT(nbytes);
108 KASSERT(nbytes % 16 == 0); 108 KASSERT(nbytes % 16 == 0);
109 109
110 iv0 = loadblock(iv); 110 iv0 = loadblock(iv);
111 cv = loadblock(in + nbytes - 16); 111 cv = loadblock(in + nbytes - 16);
112 storeblock(iv, cv); 112 storeblock(iv, cv);
113 113
114 for (;;) { 114 if (nbytes % 32) {
 115 KASSERT(nbytes % 32 == 16);
115 b = aes_neon_dec1(dec, cv, nrounds); 116 b = aes_neon_dec1(dec, cv, nrounds);
116 if ((nbytes -= 16) == 0) 117 if ((nbytes -= 16) == 0)
117 break; 118 goto out;
 119 cv = loadblock(in + nbytes - 16);
 120 storeblock(out + nbytes, cv ^ b);
 121 }
 122
 123 for (;;) {
 124 uint8x16x2_t b2;
 125
 126 KASSERT(nbytes >= 32);
 127
 128 b2.val[1] = cv;
 129 b2.val[0] = cv = loadblock(in + nbytes - 32);
 130 b2 = aes_neon_dec2(dec, b2, nrounds);
 131 storeblock(out + nbytes - 16, cv ^ b2.val[1]);
 132 if ((nbytes -= 32) == 0) {
 133 b = b2.val[0];
 134 goto out;
 135 }
118 cv = loadblock(in + nbytes - 16); 136 cv = loadblock(in + nbytes - 16);
119 storeblock(out + nbytes, b ^ cv); 137 storeblock(out + nbytes, cv ^ b2.val[0]);
120 } 138 }
121 storeblock(out, b ^ iv0); 139
 140out: storeblock(out, b ^ iv0);
122} 141}
123 142
124static inline uint8x16_t 143static inline uint8x16_t
125aes_neon_xts_update(uint8x16_t t8) 144aes_neon_xts_update(uint8x16_t t8)
126{ 145{
127 const int32x4_t zero = vdupq_n_s32(0); 146 const int32x4_t zero = vdupq_n_s32(0);
128 const int32x4_t carry = {0x87, 1, 1, 1}; 147 const int32x4_t carry = {0x87, 1, 1, 1};
129 int32x4_t t, t_; 148 int32x4_t t, t_;
130 uint32x4_t mask; 149 uint32x4_t mask;
131 150
132 t = vreinterpretq_s32_u8(t8); 151 t = vreinterpretq_s32_u8(t8);
133 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ 152 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */
134 mask = vextq_u32(mask, mask, 3); /* rotate quarters */ 153 mask = vextq_u32(mask, mask, 3); /* rotate quarters */
135 t_ = vsliq_n_s32(zero, t, 1); /* shift */ 154 t_ = vsliq_n_s32(zero, t, 1); /* shift */
136 t_ ^= carry & mask; 155 t_ ^= carry & mask;
137 156
138 return vreinterpretq_u8_s32(t_); 157 return vreinterpretq_u8_s32(t_);
139} 158}
140 159
141static int 160static int
142aes_neon_xts_update_selftest(void) 161aes_neon_xts_update_selftest(void)
143{ 162{
144 static const struct { 163 static const struct {
145 uint32_t in[4], out[4]; 164 uint32_t in[4], out[4];
146 } cases[] = { 165 } cases[] = {
147 [0] = { {1}, {2} }, 166 [0] = { {1}, {2} },
148 [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, 167 [1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
149 [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, 168 [2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
150 [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, 169 [3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
151 [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, 170 [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
152 [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, 171 [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
153 }; 172 };
154 unsigned i; 173 unsigned i;
155 uint32_t t[4]; 174 uint32_t t[4];
156 int result = 0; 175 int result = 0;
157 176
158 for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { 177 for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
159 t[0] = cases[i].in[0]; 178 t[0] = cases[i].in[0];
160 t[1] = cases[i].in[1]; 179 t[1] = cases[i].in[1];
161 t[2] = cases[i].in[2]; 180 t[2] = cases[i].in[2];
162 t[3] = cases[i].in[3]; 181 t[3] = cases[i].in[3];
163 storeblock(t, aes_neon_xts_update(loadblock(t))); 182 storeblock(t, aes_neon_xts_update(loadblock(t)));
164 if (t[0] != cases[i].out[0] || 183 if (t[0] != cases[i].out[0] ||
165 t[1] != cases[i].out[1] || 184 t[1] != cases[i].out[1] ||
166 t[2] != cases[i].out[2] || 185 t[2] != cases[i].out[2] ||
167 t[3] != cases[i].out[3]) { 186 t[3] != cases[i].out[3]) {
168 printf("%s %u:" 187 printf("%s %u:"
169 " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", 188 " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
170 __func__, i, t[0], t[1], t[2], t[3]); 189 __func__, i, t[0], t[1], t[2], t[3]);
171 result = -1; 190 result = -1;
172 } 191 }
173 } 192 }
174 193
175 return result; 194 return result;
176} 195}
177 196
178void 197void
179aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], 198aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
180 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 199 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
181 uint32_t nrounds) 200 uint32_t nrounds)
182{ 201{
183 uint8x16_t t, b; 202 uint8x16_t t, b;
184 203
185 KASSERT(nbytes); 204 KASSERT(nbytes);
186 KASSERT(nbytes % 16 == 0); 205 KASSERT(nbytes % 16 == 0);
187 206
188 t = loadblock(tweak); 207 t = loadblock(tweak);
189 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 208 if (nbytes % 32) {
 209 KASSERT(nbytes % 32 == 16);
190 b = t ^ loadblock(in); 210 b = t ^ loadblock(in);
191 b = aes_neon_enc1(enc, b, nrounds); 211 b = aes_neon_enc1(enc, b, nrounds);
192 storeblock(out, t ^ b); 212 storeblock(out, t ^ b);
193 t = aes_neon_xts_update(t); 213 t = aes_neon_xts_update(t);
 214 nbytes -= 16;
 215 in += 16;
 216 out += 16;
 217 }
 218 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 219 uint8x16_t t1;
 220 uint8x16x2_t b2;
 221
 222 t1 = aes_neon_xts_update(t);
 223 b2.val[0] = t ^ loadblock(in);
 224 b2.val[1] = t1 ^ loadblock(in + 16);
 225 b2 = aes_neon_enc2(enc, b2, nrounds);
 226 storeblock(out, b2.val[0] ^ t);
 227 storeblock(out + 16, b2.val[1] ^ t1);
 228
 229 t = aes_neon_xts_update(t1);
194 } 230 }
195 storeblock(tweak, t); 231 storeblock(tweak, t);
196} 232}
197 233
198void 234void
199aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], 235aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
200 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 236 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
201 uint32_t nrounds) 237 uint32_t nrounds)
202{ 238{
203 uint8x16_t t, b; 239 uint8x16_t t, b;
204 240
205 KASSERT(nbytes); 241 KASSERT(nbytes);
206 KASSERT(nbytes % 16 == 0); 242 KASSERT(nbytes % 16 == 0);
207 243
208 t = loadblock(tweak); 244 t = loadblock(tweak);
209 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 245 if (nbytes % 32) {
 246 KASSERT(nbytes % 32 == 16);
210 b = t ^ loadblock(in); 247 b = t ^ loadblock(in);
211 b = aes_neon_dec1(dec, b, nrounds); 248 b = aes_neon_dec1(dec, b, nrounds);
212 storeblock(out, t ^ b); 249 storeblock(out, t ^ b);
213 t = aes_neon_xts_update(t); 250 t = aes_neon_xts_update(t);
 251 nbytes -= 16;
 252 in += 16;
 253 out += 16;
 254 }
 255 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 256 uint8x16_t t1;
 257 uint8x16x2_t b2;
 258
 259 t1 = aes_neon_xts_update(t);
 260 b2.val[0] = t ^ loadblock(in);
 261 b2.val[1] = t1 ^ loadblock(in + 16);
 262 b2 = aes_neon_dec2(dec, b2, nrounds);
 263 storeblock(out, b2.val[0] ^ t);
 264 storeblock(out + 16, b2.val[1] ^ t1);
 265
 266 t = aes_neon_xts_update(t1);
214 } 267 }
215 storeblock(tweak, t); 268 storeblock(tweak, t);
216} 269}
217 270
218void 271void
219aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], 272aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
220 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) 273 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
221{ 274{
222 uint8x16_t auth; 275 uint8x16_t auth;
223 276
224 KASSERT(nbytes); 277 KASSERT(nbytes);
225 KASSERT(nbytes % 16 == 0); 278 KASSERT(nbytes % 16 == 0);
226 279
227 auth = loadblock(auth0); 280 auth = loadblock(auth0);
228 for (; nbytes; nbytes -= 16, in += 16) 281 for (; nbytes; nbytes -= 16, in += 16)
229 auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds); 282 auth = aes_neon_enc1(enc, auth ^ loadblock(in), nrounds);
230 storeblock(auth0, auth); 283 storeblock(auth0, auth);
231} 284}
232 285
233/* 286/*
234 * XXX On aarch64, we have enough registers that we should be able to 287 * XXX On aarch64, we have enough registers that we should be able to
235 * pipeline two simultaneous vpaes computations in an `aes_neon_enc2' 288 * pipeline two simultaneous vpaes computations in an `aes_neon_enc2'
236 * function, which should substantially improve CCM throughput. 289 * function, which should substantially improve CCM throughput.
237 */ 290 */
238 291
239#if _BYTE_ORDER == _LITTLE_ENDIAN 292#if _BYTE_ORDER == _LITTLE_ENDIAN
240#define vbetoh32q_u8 vrev32q_u8 293#define vbetoh32q_u8 vrev32q_u8
241#define vhtobe32q_u8 vrev32q_u8 294#define vhtobe32q_u8 vrev32q_u8
242#elif _BYTE_ORDER == _BIG_ENDIAN 295#elif _BYTE_ORDER == _BIG_ENDIAN
243#define vbetoh32q_u8(x) (x) 296#define vbetoh32q_u8(x) (x)
244#define vhtobe32q_u8(x) (x) 297#define vhtobe32q_u8(x) (x)
245#else 298#else
246#error what kind of endian are you anyway 299#error what kind of endian are you anyway
247#endif 300#endif
248 301
249void 302void
250aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], 303aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
251 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 304 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
252 uint32_t nrounds) 305 uint32_t nrounds)
253{ 306{
254 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 307 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
255 uint8x16_t auth, ptxt, ctr_be; 308 uint8x16_t auth, ptxt, ctr_be;
256 uint32x4_t ctr; 309 uint32x4_t ctr;
257 310
258 KASSERT(nbytes); 311 KASSERT(nbytes);
259 KASSERT(nbytes % 16 == 0); 312 KASSERT(nbytes % 16 == 0);
260 313
261 auth = loadblock(authctr); 314 auth = loadblock(authctr);
262 ctr_be = loadblock(authctr + 16); 315 ctr_be = loadblock(authctr + 16);
263 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 316 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
264 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 317 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 318 uint8x16x2_t b2;
265 ptxt = loadblock(in); 319 ptxt = loadblock(in);
266 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 
267 ctr = vaddq_u32(ctr, ctr32_inc); 320 ctr = vaddq_u32(ctr, ctr32_inc);
268 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 321 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
269 storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); 322
 323 b2.val[0] = auth ^ ptxt;
 324 b2.val[1] = ctr_be;
 325 b2 = aes_neon_enc2(enc, b2, nrounds);
 326 auth = b2.val[0];
 327 storeblock(out, ptxt ^ b2.val[1]);
270 } 328 }
271 storeblock(authctr, auth); 329 storeblock(authctr, auth);
272 storeblock(authctr + 16, ctr_be); 330 storeblock(authctr + 16, ctr_be);
273} 331}
274 332
275void 333void
276aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], 334aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
277 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 335 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
278 uint32_t nrounds) 336 uint32_t nrounds)
279{ 337{
280 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 338 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
281 uint8x16_t auth, ctr_be, ptxt; 339 uint8x16_t auth, ctr_be, ptxt, pad;
282 uint32x4_t ctr; 340 uint32x4_t ctr;
283 341
284 KASSERT(nbytes); 342 KASSERT(nbytes);
285 KASSERT(nbytes % 16 == 0); 343 KASSERT(nbytes % 16 == 0);
286 344
287 auth = loadblock(authctr); 
288 ctr_be = loadblock(authctr + 16); 345 ctr_be = loadblock(authctr + 16);
289 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 346 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
290 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 347 ctr = vaddq_u32(ctr, ctr32_inc);
 348 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
 349 pad = aes_neon_enc1(enc, ctr_be, nrounds);
 350 auth = loadblock(authctr);
 351 for (;; in += 16, out += 16) {
 352 uint8x16x2_t b2;
 353
 354 ptxt = loadblock(in) ^ pad;
 355 auth ^= ptxt;
 356 storeblock(out, ptxt);
 357
 358 if ((nbytes -= 16) == 0)
 359 break;
 360
291 ctr = vaddq_u32(ctr, ctr32_inc); 361 ctr = vaddq_u32(ctr, ctr32_inc);
292 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 362 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
293 ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); 363 b2.val[0] = auth;
294 storeblock(out, ptxt); 364 b2.val[1] = ctr_be;
295 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 365 b2 = aes_neon_enc2(enc, b2, nrounds);
 366 auth = b2.val[0];
 367 pad = b2.val[1];
296 } 368 }
 369 auth = aes_neon_enc1(enc, auth, nrounds);
297 storeblock(authctr, auth); 370 storeblock(authctr, auth);
298 storeblock(authctr + 16, ctr_be); 371 storeblock(authctr + 16, ctr_be);
299} 372}
300 373
301int 374int
302aes_neon_selftest(void) 375aes_neon_selftest(void)
303{ 376{
304 377
305 if (aes_neon_xts_update_selftest()) 378 if (aes_neon_xts_update_selftest())
306 return -1; 379 return -1;
307 380
308 return 0; 381 return 0;
309} 382}
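
The rewritten XTS loops above share one control-flow shape: when the buffer holds an odd number of 16-byte blocks, one block is peeled off through the existing one-block path, and the remainder is walked in 32-byte steps so both lanes of the new two-block path carry real work, with aes_neon_xts_update chaining the tweak across the two lanes.  Below is a minimal portable sketch of that shape only; process(), do_block1() and do_block2() are hypothetical stand-ins, not the NetBSD routines.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical placeholders for the 1-block and 2-block primitives. */
static void
do_block1(uint8_t out[16], const uint8_t in[16])
{
        memcpy(out, in, 16);            /* stand-in: identity */
}

static void
do_block2(uint8_t out[32], const uint8_t in[32])
{
        memcpy(out, in, 32);            /* stand-in: identity */
}

static void
process(uint8_t *out, const uint8_t *in, size_t nbytes)
{
        /* Caller guarantees nbytes > 0 and nbytes % 16 == 0. */
        if (nbytes % 32) {              /* odd number of blocks */
                do_block1(out, in);
                in += 16, out += 16, nbytes -= 16;
        }
        for (; nbytes; nbytes -= 32, in += 32, out += 32)
                do_block2(out, in);     /* two blocks per iteration */
}

The CCM loops reach the same goal differently: each aes_neon_enc2 call pairs the CBC-MAC block with a CTR pad block, so both lanes do useful work on every pass through the loop.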

cvs diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h

--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,42 +1,71 @@ @@ -1,42 +1,71 @@
1/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ 1/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
31 31
32#include <sys/types.h> 32#include <sys/types.h>
33 33
34#include "arm_neon.h" 34#include "arm_neon.h"
35 35
36#include <crypto/aes/aes.h> 36#include <crypto/aes/aes.h>
37#include <crypto/aes/arch/arm/aes_neon.h> 37#include <crypto/aes/arch/arm/aes_neon.h>
38 38
39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); 39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); 40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
41 41
 42#ifdef __aarch64__
 43
 44uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
 45uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
 46
 47#else
 48
 49static inline uint8x16x2_t
 50aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
 51{
 52
 53 return (uint8x16x2_t) { .val = {
 54 [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
 55 [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
 56 } };
 57}
 58
 59static inline uint8x16x2_t
 60aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
 61{
 62
 63 return (uint8x16x2_t) { .val = {
 64 [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
 65 [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
 66 } };
 67}
 68
 69#endif
 70
42#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ 71#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
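
The header change is what lets the callers above use the two-block interface unconditionally: on aarch64, where 32 SIMD registers leave room to interleave two vpaes states, aes_neon_enc2 and aes_neon_dec2 are real two-lane routines, while on 32-bit ARM the static inline fallbacks simply run the one-block routine twice.  The sketch below shows that fallback pattern in plain C under hypothetical names (struct pair16, one_block(), two_blocks()); it is an illustration of the shape, not the NEON code.

#include <stdint.h>

/* Hypothetical plain-C analogue of the uint8x16x2_t interface. */
struct pair16 { uint8_t val[2][16]; };

/* Stand-in for the one-block primitive (aes_neon_enc1 in the real code). */
static void
one_block(uint8_t b[16])
{
        for (int i = 0; i < 16; i++)
                b[i] ^= 0x5a;
}

/*
 * Two-block interface.  This is the shape of the #else fallback above:
 * run the one-block routine twice.  A native backend (the aarch64 vpaes
 * one) can substitute a genuinely interleaved implementation.
 */
static struct pair16
two_blocks(struct pair16 p)
{
        one_block(p.val[0]);
        one_block(p.val[1]);
        return p;
}

Passing the two-element aggregate by value mirrors uint8x16x2_t, which the aarch64 calling convention can keep in vector registers across the call.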

cvs diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,534 +1,536 @@ @@ -1,534 +1,536 @@
1/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H 29#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
30#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H 30#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
31 31
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint32x4_t uint32x4_t; 42typedef __Uint32x4_t uint32x4_t;
43typedef __Uint64x2_t uint64x2_t; 43typedef __Uint64x2_t uint64x2_t;
44typedef __Uint8x16_t uint8x16_t; 44typedef __Uint8x16_t uint8x16_t;
 45typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
45#else 46#else
46typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
47typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
48typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
49typedef __simd128_uint32_t uint32x4_t; 50typedef __simd128_uint32_t uint32x4_t;
50typedef __simd128_uint64_t uint64x2_t; 51typedef __simd128_uint64_t uint64x2_t;
51typedef __simd128_uint8_t uint8x16_t; 52typedef __simd128_uint8_t uint8x16_t;
52 53
53typedef __simd64_int8_t int8x8_t; 54typedef __simd64_int8_t int8x8_t;
54typedef __simd64_uint8_t uint8x8_t; 55typedef __simd64_uint8_t uint8x8_t;
55typedef __builtin_neon_udi uint64x1_t; 56typedef __builtin_neon_udi uint64x1_t;
56typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 57typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
 58typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
57#endif 59#endif
58 60
59#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
60#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
61#else 63#else
62#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
63#endif 65#endif
64 66
65#elif defined(__clang__) 67#elif defined(__clang__)
66 68
67#define _INTRINSATTR \ 69#define _INTRINSATTR \
68 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
69 71
70typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; 72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
71typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; 73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
72typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; 74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
73typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; 75typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
74typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; 76typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
75typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; 77typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
76 78
77typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; 79typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
78typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 80typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
79 81
80#ifdef __LITTLE_ENDIAN__ 82#ifdef __LITTLE_ENDIAN__
81#define __neon_lane_index(__v, __i) __i 83#define __neon_lane_index(__v, __i) __i
82#else 84#else
83#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 85#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
84#endif 86#endif
85 87
86#else 88#else
87 89
88#error Teach me how to neon in your compile! 90#error Teach me how to neon in your compile!
89 91
90#endif 92#endif
91 93
92_INTRINSATTR 94_INTRINSATTR
93static __inline uint32x4_t 95static __inline uint32x4_t
94vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) 96vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
95{ 97{
96 return __v0 + __v1; 98 return __v0 + __v1;
97} 99}
98 100
99_INTRINSATTR 101_INTRINSATTR
100static __inline uint32x4_t 102static __inline uint32x4_t
101vcltq_s32(int32x4_t __v0, int32x4_t __v1) 103vcltq_s32(int32x4_t __v0, int32x4_t __v1)
102{ 104{
103 return (uint32x4_t)(__v0 < __v1); 105 return (uint32x4_t)(__v0 < __v1);
104} 106}
105 107
106_INTRINSATTR 108_INTRINSATTR
107static __inline int32x4_t 109static __inline int32x4_t
108vdupq_n_s32(int32_t __x) 110vdupq_n_s32(int32_t __x)
109{ 111{
110 return (int32x4_t) { __x, __x, __x, __x }; 112 return (int32x4_t) { __x, __x, __x, __x };
111} 113}
112 114
113_INTRINSATTR 115_INTRINSATTR
114static __inline uint32x4_t 116static __inline uint32x4_t
115vdupq_n_u32(uint32_t __x) 117vdupq_n_u32(uint32_t __x)
116{ 118{
117 return (uint32x4_t) { __x, __x, __x, __x }; 119 return (uint32x4_t) { __x, __x, __x, __x };
118} 120}
119 121
120_INTRINSATTR 122_INTRINSATTR
121static __inline uint8x16_t 123static __inline uint8x16_t
122vdupq_n_u8(uint8_t __x) 124vdupq_n_u8(uint8_t __x)
123{ 125{
124 return (uint8x16_t) { 126 return (uint8x16_t) {
125 __x, __x, __x, __x, __x, __x, __x, __x, 127 __x, __x, __x, __x, __x, __x, __x, __x,
126 __x, __x, __x, __x, __x, __x, __x, __x, 128 __x, __x, __x, __x, __x, __x, __x, __x,
127 }; 129 };
128} 130}
129 131
130#if defined(__GNUC__) && !defined(__clang__) 132#if defined(__GNUC__) && !defined(__clang__)
131_INTRINSATTR 133_INTRINSATTR
132static __inline uint32x4_t 134static __inline uint32x4_t
133vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) 135vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
134{ 136{
135#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 137#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
136 return __builtin_shuffle(__hi, __lo, 138 return __builtin_shuffle(__hi, __lo,
137 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); 139 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
138#else 140#else
139 return __builtin_shuffle(__lo, __hi, 141 return __builtin_shuffle(__lo, __hi,
140 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); 142 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
141#endif 143#endif
142} 144}
143#elif defined(__clang__) 145#elif defined(__clang__)
144#ifdef __LITTLE_ENDIAN__ 146#ifdef __LITTLE_ENDIAN__
145#define vextq_u32(__lo, __hi, __i) \ 147#define vextq_u32(__lo, __hi, __i) \
146 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 148 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
147 (int8x16_t)(__hi), (__i), 50) 149 (int8x16_t)(__hi), (__i), 50)
148#else 150#else
149#define vextq_u32(__lo, __hi, __i) ( \ 151#define vextq_u32(__lo, __hi, __i) ( \
150{ \ 152{ \
151 uint32x4_t __tlo = (__lo); \ 153 uint32x4_t __tlo = (__lo); \
152 uint32x4_t __thi = (__hi); \ 154 uint32x4_t __thi = (__hi); \
153 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ 155 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
154 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ 156 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
155 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 157 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
156 (int8x16_t)__hi_r, __i, 50); \ 158 (int8x16_t)__hi_r, __i, 50); \
157 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 159 __builtin_shufflevector(__r, __r, 3,2,1,0); \
158}) 160})
159#endif /* __LITTLE_ENDIAN__ */ 161#endif /* __LITTLE_ENDIAN__ */
160#endif 162#endif
161 163
162#if defined(__GNUC__) && !defined(__clang__) 164#if defined(__GNUC__) && !defined(__clang__)
163_INTRINSATTR 165_INTRINSATTR
164static __inline uint8x16_t 166static __inline uint8x16_t
165vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) 167vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
166{ 168{
167#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 169#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
168 return __builtin_shuffle(__hi, __lo, 170 return __builtin_shuffle(__hi, __lo,
169 (uint8x16_t) { 171 (uint8x16_t) {
170 16 - __i, 17 - __i, 18 - __i, 19 - __i, 172 16 - __i, 17 - __i, 18 - __i, 19 - __i,
171 20 - __i, 21 - __i, 22 - __i, 23 - __i, 173 20 - __i, 21 - __i, 22 - __i, 23 - __i,
172 24 - __i, 25 - __i, 26 - __i, 27 - __i, 174 24 - __i, 25 - __i, 26 - __i, 27 - __i,
173 28 - __i, 29 - __i, 30 - __i, 31 - __i, 175 28 - __i, 29 - __i, 30 - __i, 31 - __i,
174 }); 176 });
175#else 177#else
176 return __builtin_shuffle(__lo, __hi, 178 return __builtin_shuffle(__lo, __hi,
177 (uint8x16_t) { 179 (uint8x16_t) {
178 __i + 0, __i + 1, __i + 2, __i + 3, 180 __i + 0, __i + 1, __i + 2, __i + 3,
179 __i + 4, __i + 5, __i + 6, __i + 7, 181 __i + 4, __i + 5, __i + 6, __i + 7,
180 __i + 8, __i + 9, __i + 10, __i + 11, 182 __i + 8, __i + 9, __i + 10, __i + 11,
181 __i + 12, __i + 13, __i + 14, __i + 15, 183 __i + 12, __i + 13, __i + 14, __i + 15,
182 }); 184 });
183#endif 185#endif
184} 186}
185#elif defined(__clang__) 187#elif defined(__clang__)
186#ifdef __LITTLE_ENDIAN__ 188#ifdef __LITTLE_ENDIAN__
187#define vextq_u8(__lo, __hi, __i) \ 189#define vextq_u8(__lo, __hi, __i) \
188 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 190 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
189 (int8x16_t)(__hi), (__i), 48) 191 (int8x16_t)(__hi), (__i), 48)
190#else 192#else
191#define vextq_u8(__lo, __hi, __i) ( \ 193#define vextq_u8(__lo, __hi, __i) ( \
192{ \ 194{ \
193 uint8x16_t __tlo = (__lo); \ 195 uint8x16_t __tlo = (__lo); \
194 uint8x16_t __thi = (__hi); \ 196 uint8x16_t __thi = (__hi); \
195 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ 197 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
196 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 198 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
197 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ 199 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
198 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
199 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 201 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
200 (int8x16_t)__hi_r, (__i), 48); \ 202 (int8x16_t)__hi_r, (__i), 48); \
201 return __builtin_shufflevector(__r, __r, \ 203 return __builtin_shufflevector(__r, __r, \
202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 204 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
203}) 205})
204#endif /* __LITTLE_ENDIAN */ 206#endif /* __LITTLE_ENDIAN */
205#endif 207#endif
206 208
207#if defined(__GNUC__) && !defined(__clang__) 209#if defined(__GNUC__) && !defined(__clang__)
208_INTRINSATTR 210_INTRINSATTR
209static __inline uint32_t 211static __inline uint32_t
210vgetq_lane_u32(uint32x4_t __v, uint8_t __i) 212vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
211{ 213{
212#ifdef __aarch64__ 214#ifdef __aarch64__
213 return __v[__i]; 215 return __v[__i];
214#else 216#else
215 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); 217 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
216#endif 218#endif
217} 219}
218#elif defined(__clang__) 220#elif defined(__clang__)
219#define vgetq_lane_u32(__v, __i) \ 221#define vgetq_lane_u32(__v, __i) \
220 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ 222 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
221 __neon_lane_index(__v, __i)) 223 __neon_lane_index(__v, __i))
222#endif 224#endif
223 225
224_INTRINSATTR 226_INTRINSATTR
225static __inline uint32x4_t 227static __inline uint32x4_t
226vld1q_u32(const uint32_t *__p32) 228vld1q_u32(const uint32_t *__p32)
227{ 229{
228#if defined(__GNUC__) && !defined(__clang__) 230#if defined(__GNUC__) && !defined(__clang__)
229#ifdef __aarch64__ 231#ifdef __aarch64__
230 const __builtin_aarch64_simd_si *__p = 232 const __builtin_aarch64_simd_si *__p =
231 (const __builtin_aarch64_simd_si *)__p32; 233 (const __builtin_aarch64_simd_si *)__p32;
232 234
233 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); 235 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
234#else 236#else
235 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; 237 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
236 238
237 return (uint32x4_t)__builtin_neon_vld1v4si(__p); 239 return (uint32x4_t)__builtin_neon_vld1v4si(__p);
238#endif 240#endif
239#elif defined(__clang__) 241#elif defined(__clang__)
240 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); 242 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
241#ifndef __LITTLE_ENDIAN__ 243#ifndef __LITTLE_ENDIAN__
242 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 244 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
243#endif 245#endif
244 return __v; 246 return __v;
245#endif 247#endif
246} 248}
247 249
248_INTRINSATTR 250_INTRINSATTR
249static __inline uint8x16_t 251static __inline uint8x16_t
250vld1q_u8(const uint8_t *__p8) 252vld1q_u8(const uint8_t *__p8)
251{ 253{
252#if defined(__GNUC__) && !defined(__clang__) 254#if defined(__GNUC__) && !defined(__clang__)
253#ifdef __aarch64__ 255#ifdef __aarch64__
254 const __builtin_aarch64_simd_qi *__p = 256 const __builtin_aarch64_simd_qi *__p =
255 (const __builtin_aarch64_simd_qi *)__p8; 257 (const __builtin_aarch64_simd_qi *)__p8;
256 258
257 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); 259 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
258#else 260#else
259 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; 261 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
260 262
261 return (uint8x16_t)__builtin_neon_vld1v16qi(__p); 263 return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
262#endif 264#endif
263#elif defined(__clang__) 265#elif defined(__clang__)
264 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); 266 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
265#ifndef __LITTLE_ENDIAN__ 267#ifndef __LITTLE_ENDIAN__
266 __v = __builtin_shufflevector(__v, __v, 268 __v = __builtin_shufflevector(__v, __v,
267 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 269 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
268#endif 270#endif
269 return __v; 271 return __v;
270#endif 272#endif
271} 273}
272 274
273_INTRINSATTR 275_INTRINSATTR
274static __inline uint8x16_t 276static __inline uint8x16_t
275vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) 277vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
276{ 278{
277#if defined(__GNUC__) && !defined(__clang__) 279#if defined(__GNUC__) && !defined(__clang__)
278#ifdef __aarch64__ 280#ifdef __aarch64__
279 uint8x16_t __res; 281 uint8x16_t __res;
280 __asm__("tbl %0.16b, {%1.16b}, %2.16b" 282 __asm__("tbl %0.16b, {%1.16b}, %2.16b"
281 : "=w"(__res) : "w"(__tab), "w"(__idx)); 283 : "=w"(__res) : "w"(__tab), "w"(__idx));
282 return __res; 284 return __res;
283#else 285#else
284 /* 286 /*
285 * No native ARMv7 NEON instruction for this, so do it via two 287 * No native ARMv7 NEON instruction for this, so do it via two
286 * half-width TBLs instead (vtbl2_u8 equivalent). 288 * half-width TBLs instead (vtbl2_u8 equivalent).
287 */ 289 */
288 uint64x2_t __tab64 = (uint64x2_t)__tab; 290 uint64x2_t __tab64 = (uint64x2_t)__tab;
289 uint8x8_t __tablo = (uint8x8_t)__tab64[0]; 291 uint8x8_t __tablo = (uint8x8_t)__tab64[0];
290 uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; 292 uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
291 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; 293 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
292 union { 294 union {
293 uint8x8x2_t __u8x8x2; 295 uint8x8x2_t __u8x8x2;
294 __builtin_neon_ti __ti; 296 __builtin_neon_ti __ti;
295 } __u = { __tab8x8x2 }; 297 } __u = { __tab8x8x2 };
296 uint64x2_t __idx64, __out64; 298 uint64x2_t __idx64, __out64;
297 int8x8_t __idxlo, __idxhi, __outlo, __outhi; 299 int8x8_t __idxlo, __idxhi, __outlo, __outhi;
298 300
299 __idx64 = (uint64x2_t)__idx; 301 __idx64 = (uint64x2_t)__idx;
300 __idxlo = (int8x8_t)__idx64[0]; 302 __idxlo = (int8x8_t)__idx64[0];
301 __idxhi = (int8x8_t)__idx64[1]; 303 __idxhi = (int8x8_t)__idx64[1];
302 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); 304 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
303 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); 305 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
304 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; 306 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
305 307
306 return (uint8x16_t)__out64; 308 return (uint8x16_t)__out64;
307#endif 309#endif
308#elif defined(__clang__) 310#elif defined(__clang__)
309#ifdef __LITTLE_ENDIAN__ 311#ifdef __LITTLE_ENDIAN__
310 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, 312 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
311 (int8x16_t)__idx, 48); 313 (int8x16_t)__idx, 48);
312#else 314#else
313 uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo, 315 uint32x4_t __lo_r = __builtin_shufflevector(__lo, __lo,
314 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 316 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
315 uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi, 317 uint32x4_t __hi_r = __builtin_shufflevector(__hi, __hi,
316 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 318 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
317 uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, 319 uint32x4_t __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab,
318 (int8x16_t)__idx, __i, 48); 320 (int8x16_t)__idx, __i, 48);
319 return __builtin_shufflevector(__r, __r, 321 return __builtin_shufflevector(__r, __r,
320 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 322 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
321#endif 323#endif
322#endif 324#endif
323} 325}
324 326
325_INTRINSATTR 327_INTRINSATTR
326static __inline int32x4_t 328static __inline int32x4_t
327vreinterpretq_s32_u8(uint8x16_t __v) 329vreinterpretq_s32_u8(uint8x16_t __v)
328{ 330{
329 return (int32x4_t)__v; 331 return (int32x4_t)__v;
330} 332}
331 333
332_INTRINSATTR 334_INTRINSATTR
333static __inline uint32x4_t 335static __inline uint32x4_t
334vreinterpretq_u32_u8(uint8x16_t __v) 336vreinterpretq_u32_u8(uint8x16_t __v)
335{ 337{
336 return (uint32x4_t)__v; 338 return (uint32x4_t)__v;
337} 339}
338 340
339_INTRINSATTR 341_INTRINSATTR
340static __inline uint64x2_t 342static __inline uint64x2_t
341vreinterpretq_u64_u8(uint8x16_t __v) 343vreinterpretq_u64_u8(uint8x16_t __v)
342{ 344{
343 return (uint64x2_t)__v; 345 return (uint64x2_t)__v;
344} 346}
345 347
346_INTRINSATTR 348_INTRINSATTR
347static __inline uint8x16_t 349static __inline uint8x16_t
348vreinterpretq_u8_s32(int32x4_t __v) 350vreinterpretq_u8_s32(int32x4_t __v)
349{ 351{
350 return (uint8x16_t)__v; 352 return (uint8x16_t)__v;
351} 353}
352 354
353_INTRINSATTR 355_INTRINSATTR
354static __inline uint8x16_t 356static __inline uint8x16_t
355vreinterpretq_u8_u32(uint32x4_t __v) 357vreinterpretq_u8_u32(uint32x4_t __v)
356{ 358{
357 return (uint8x16_t)__v; 359 return (uint8x16_t)__v;
358} 360}
359 361
360_INTRINSATTR 362_INTRINSATTR
361static __inline uint8x16_t 363static __inline uint8x16_t
362vreinterpretq_u8_u64(uint64x2_t __v) 364vreinterpretq_u8_u64(uint64x2_t __v)
363{ 365{
364 return (uint8x16_t)__v; 366 return (uint8x16_t)__v;
365} 367}
366 368
367_INTRINSATTR 369_INTRINSATTR
368static __inline uint8x16_t 370static __inline uint8x16_t
369vrev32q_u8(uint8x16_t __v) 371vrev32q_u8(uint8x16_t __v)
370{ 372{
371#if defined(__GNUC__) && !defined(__clang__) 373#if defined(__GNUC__) && !defined(__clang__)
372 return __builtin_shuffle(__v, 374 return __builtin_shuffle(__v,
373 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); 375 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
374#elif defined(__clang__) 376#elif defined(__clang__)
375 return __builtin_shufflevector(__v, 377 return __builtin_shufflevector(__v,
376 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); 378 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
377#endif 379#endif
378} 380}
379 381
380#if defined(__GNUC__) && !defined(__clang__) 382#if defined(__GNUC__) && !defined(__clang__)
381_INTRINSATTR 383_INTRINSATTR
382static __inline uint32x4_t 384static __inline uint32x4_t
383vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) 385vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
384{ 386{
385 __v[__neon_lane_index(__v, __i)] = __x; 387 __v[__neon_lane_index(__v, __i)] = __x;
386 return __v; 388 return __v;
387} 389}
388#elif defined(__clang__) 390#elif defined(__clang__)
389#define vsetq_lane_u32(__x, __v, __i) \ 391#define vsetq_lane_u32(__x, __v, __i) \
390 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ 392 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
391 __neon_lane_index(__v, __i)) 393 __neon_lane_index(__v, __i))
392#endif 394#endif
393 395
394#if defined(__GNUC__) && !defined(__clang__) 396#if defined(__GNUC__) && !defined(__clang__)
395_INTRINSATTR 397_INTRINSATTR
396static __inline uint64x2_t 398static __inline uint64x2_t
397vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) 399vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
398{ 400{
399 __v[__neon_lane_index(__v, __i)] = __x; 401 __v[__neon_lane_index(__v, __i)] = __x;
400 return __v; 402 return __v;
401} 403}
402#elif defined(__clang__) 404#elif defined(__clang__)
403#define vsetq_lane_u64(__x, __v, __i) \ 405#define vsetq_lane_u64(__x, __v, __i) \
404 (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \ 406 (uint64x2_t)__builtin_neon_vsetq_lane_i32((__x), (int64x2_t)(__v), \
405 __neon_lane_index(__v, __i)); 407 __neon_lane_index(__v, __i));
406#endif 408#endif
407 409
408#if defined(__GNUC__) && !defined(__clang__) 410#if defined(__GNUC__) && !defined(__clang__)
409_INTRINSATTR 411_INTRINSATTR
410static __inline uint32x4_t 412static __inline uint32x4_t
411vshlq_n_u32(uint32x4_t __v, uint8_t __bits) 413vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
412{ 414{
413#ifdef __aarch64__ 415#ifdef __aarch64__
414 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); 416 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
415#else 417#else
416 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); 418 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
417#endif 419#endif
418} 420}
419#elif defined(__clang__) 421#elif defined(__clang__)
420#define vshlq_n_u32(__v, __bits) \ 422#define vshlq_n_u32(__v, __bits) \
421 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) 423 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
422#endif 424#endif
423 425
424#if defined(__GNUC__) && !defined(__clang__) 426#if defined(__GNUC__) && !defined(__clang__)
425_INTRINSATTR 427_INTRINSATTR
426static __inline uint32x4_t 428static __inline uint32x4_t
427vshrq_n_u32(uint32x4_t __v, uint8_t __bits) 429vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
428{ 430{
429#ifdef __aarch64__ 431#ifdef __aarch64__
430 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); 432 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
431#else 433#else
432 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); 434 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
433#endif 435#endif
434} 436}
435#elif defined(__clang__) 437#elif defined(__clang__)
436#define vshrq_n_u8(__v, __bits) \ 438#define vshrq_n_u8(__v, __bits) \
437 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) 439 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
438#endif 440#endif
439 441
440#if defined(__GNUC__) && !defined(__clang__) 442#if defined(__GNUC__) && !defined(__clang__)
441_INTRINSATTR 443_INTRINSATTR
442static __inline uint8x16_t 444static __inline uint8x16_t
443vshrq_n_u8(uint8x16_t __v, uint8_t __bits) 445vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
444{ 446{
445#ifdef __aarch64__ 447#ifdef __aarch64__
446 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); 448 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
447#else 449#else
448 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); 450 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
449#endif 451#endif
450} 452}
451#elif defined(__clang__) 453#elif defined(__clang__)
452#define vshrq_n_u8(__v, __bits) \ 454#define vshrq_n_u8(__v, __bits) \
453 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) 455 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
454#endif 456#endif
455 457
456#if defined(__GNUC__) && !defined(__clang__) 458#if defined(__GNUC__) && !defined(__clang__)
457_INTRINSATTR 459_INTRINSATTR
458static __inline int32x4_t 460static __inline int32x4_t
459vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) 461vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
460{ 462{
461#ifdef __aarch64__ 463#ifdef __aarch64__
462 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); 464 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
463#else 465#else
464 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); 466 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
465#endif 467#endif
466} 468}
467#elif defined(__clang__) 469#elif defined(__clang__)
468#ifdef __LITTLE_ENDIAN__ 470#ifdef __LITTLE_ENDIAN__
469#define vsliq_n_s32(__vins, __vsh, __bits) \ 471#define vsliq_n_s32(__vins, __vsh, __bits) \
470 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ 472 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
471 (int32x4_t)(__vsh), (__bits), 34) 473 (int32x4_t)(__vsh), (__bits), 34)
472#else 474#else
473#define vsliq_n_s32(__vins, __vsh, __bits) ( \ 475#define vsliq_n_s32(__vins, __vsh, __bits) ( \
474{ \ 476{ \
475 int32x4_t __tvins = (__vins); \ 477 int32x4_t __tvins = (__vins); \
476 int32x4_t __tvsh = (__vsh); \ 478 int32x4_t __tvsh = (__vsh); \
477 uint8_t __tbits = (__bits); \ 479 uint8_t __tbits = (__bits); \
478 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ 480 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
479 3,2,1,0); \ 481 3,2,1,0); \
480 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ 482 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
481 3,2,1,0); \ 483 3,2,1,0); \
482 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ 484 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \
483 34); \ 485 34); \
484 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 486 __builtin_shufflevector(__r, __r, 3,2,1,0); \
485}) 487})
486#endif /* __LITTLE_ENDIAN__ */ 488#endif /* __LITTLE_ENDIAN__ */
487#endif 489#endif
488 490
489_INTRINSATTR 491_INTRINSATTR
490static __inline void 492static __inline void
491vst1q_u32(uint32_t *__p32, uint32x4_t __v) 493vst1q_u32(uint32_t *__p32, uint32x4_t __v)
492{ 494{
493#if defined(__GNUC__) && !defined(__clang__) 495#if defined(__GNUC__) && !defined(__clang__)
494#ifdef __aarch64__ 496#ifdef __aarch64__
495 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; 497 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
496 498
497 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); 499 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
498#else 500#else
499 __builtin_neon_si *__p = (__builtin_neon_si *)__p32; 501 __builtin_neon_si *__p = (__builtin_neon_si *)__p32;
500 502
501 __builtin_neon_vst1v4si(__p, (int32x4_t)__v); 503 __builtin_neon_vst1v4si(__p, (int32x4_t)__v);
502#endif 504#endif
503#elif defined(__clang__) 505#elif defined(__clang__)
504#ifndef __LITTLE_ENDIAN__ 506#ifndef __LITTLE_ENDIAN__
505 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 507 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
506#endif 508#endif
507 __builtin_neon_vst1q_v(__p32, __v, 50); 509 __builtin_neon_vst1q_v(__p32, __v, 50);
508#endif 510#endif
509} 511}
510 512
511_INTRINSATTR 513_INTRINSATTR
512static __inline void 514static __inline void
513vst1q_u8(uint8_t *__p8, uint8x16_t __v) 515vst1q_u8(uint8_t *__p8, uint8x16_t __v)
514{ 516{
515#if defined(__GNUC__) && !defined(__clang__) 517#if defined(__GNUC__) && !defined(__clang__)
516#ifdef __aarch64__ 518#ifdef __aarch64__
517 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; 519 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
518 520
519 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); 521 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
520#else 522#else
521 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; 523 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
522 524
523 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); 525 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
524#endif 526#endif
525#elif defined(__clang__) 527#elif defined(__clang__)
526#ifndef __LITTLE_ENDIAN__ 528#ifndef __LITTLE_ENDIAN__
527 __v = __builtin_shufflevector(__v, __v, 529 __v = __builtin_shufflevector(__v, __v,
528 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 530 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
529#endif 531#endif
530 __builtin_neon_vst1q_v(__p8, __v, 48); 532 __builtin_neon_vst1q_v(__p8, __v, 48);
531#endif 533#endif
532} 534}
533 535
534#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ 536#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */
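
Aside from the new uint8x16x2_t typedefs, this header keeps supplying GCC- and clang-portable stand-ins for the NEON intrinsics, including the vrev32q_u8 behind the CCM byte-order macros.  At the byte level, the vbetoh32q_u8 / vaddq_u32 / vhtobe32q_u8 sequence in the CCM loops increments the trailing 32-bit word of the counter block as a big-endian integer (with no carry into earlier words, since vaddq_u32 is per-lane).  A scalar sketch of just that arithmetic, with a hypothetical ctr32_inc_be() helper; it is an illustration, not the NEON code:

#include <stdint.h>

/*
 * Increment the trailing 32-bit big-endian counter of a 16-byte CCM
 * counter block -- the byte-level effect of rev32 (on little-endian
 * hosts), add {0,0,0,1} lane-wise, rev32 back.  The counter wraps
 * mod 2^32; the carry does not propagate into bytes 0-11.
 */
static void
ctr32_inc_be(uint8_t block[16])
{
        uint32_t c;

        c = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
            ((uint32_t)block[14] << 8) | (uint32_t)block[15];
        c++;
        block[12] = (uint8_t)(c >> 24);
        block[13] = (uint8_t)(c >> 16);
        block[14] = (uint8_t)(c >> 8);
        block[15] = (uint8_t)c;
}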