Tue Jul 28 20:11:09 2020 UTC
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4, whose CPU also
lacks ARMv8.0-AES.


(riastradh)
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h
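
For context on why the 2x path pays off: in CBC decryption each ciphertext
block feeds the AES core independently, and only the final XOR with the
preceding ciphertext block is chained, so adjacent blocks can share one
vectorized call.  A minimal sketch of the idea, assuming the loadblock/
storeblock helpers from aes_neon_subr.c and the aes_neon_dec2 entry point
added in the diffs below (tail and IV bookkeeping omitted):

/*
 * Sketch only: decrypt adjacent ciphertext blocks C[i] and C[i+1] in
 * one 2x call.  The two AES inversions are independent; only the XORs
 * with the preceding ciphertext blocks are chained.
 */
static void
cbc_dec2_sketch(const struct aesdec *dec, const uint8_t in[static 32],
    uint8_t out[static 32], uint8x16_t cprev /* C[i-1], or the IV */,
    uint32_t nrounds)
{
	uint8x16x2_t b2;

	b2.val[0] = loadblock(in);	/* C[i] */
	b2.val[1] = loadblock(in + 16);	/* C[i+1] */
	b2 = aes_neon_dec2(dec, b2, nrounds);
	storeblock(out, b2.val[0] ^ cprev);			/* P[i] */
	storeblock(out + 16, b2.val[1] ^ loadblock(in));	/* P[i+1] */
}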

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c

--- src/sys/crypto/aes/arch/arm/aes_neon.c 2020/06/30 20:32:11 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ 1/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -29,27 +29,27 @@ @@ -29,27 +29,27 @@
29/* 29/*
30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES 30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
31 * software, at <https://crypto.stanford.edu/vpaes/>, described in 31 * software, at <https://crypto.stanford.edu/vpaes/>, described in
32 * 32 *
33 * Mike Hamburg, `Accelerating AES with Vector Permute 33 * Mike Hamburg, `Accelerating AES with Vector Permute
34 * Instructions', in Christophe Clavier and Kris Gaj (eds.), 34 * Instructions', in Christophe Clavier and Kris Gaj (eds.),
35 * Cryptographic Hardware and Embedded Systems -- CHES 2009, 35 * Cryptographic Hardware and Embedded Systems -- CHES 2009,
36 * Springer LNCS 5747, pp. 18-32. 36 * Springer LNCS 5747, pp. 18-32.
37 * 37 *
38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39 */ 39 */
40 40
41#include <sys/cdefs.h> 41#include <sys/cdefs.h>
 42__KERNEL_RCSID(0, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); 42__KERNEL_RCSID(0, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
43 43
44#include <sys/types.h> 44#include <sys/types.h>
45 45
46#ifdef _KERNEL 46#ifdef _KERNEL
47#include <sys/systm.h> 47#include <sys/systm.h>
48#else 48#else
49#include <err.h> 49#include <err.h>
50#define panic(fmt, args...) err(1, fmt, ##args) 50#define panic(fmt, args...) err(1, fmt, ##args)
51#endif 51#endif
52 52
53#include "aes_neon_impl.h" 53#include "aes_neon_impl.h"
54 54
55#ifdef __aarch64__ 55#ifdef __aarch64__
@@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc,  @@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc,
579 579
580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); 580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
581 A ^= loadroundkey(rk32); 581 A ^= loadroundkey(rk32);
582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); 582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); 583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); 584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); 585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
586 } 586 }
587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); 587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
588 x ^= loadroundkey(rk32); 588 x ^= loadroundkey(rk32);
589 return vqtbl1q_u8(x, sr[rmod4]); 589 return vqtbl1q_u8(x, sr[rmod4]);
590} 590}
591 591
 592uint8x16x2_t
 593aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
 594{
 595 const uint32_t *rk32 = enc->aese_aes.aes_rk;
 596 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 597 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 598 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
 599 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
 600 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
 601 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
 602 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 603 uint8x16_t io0, jo0, io1, jo1;
 604 unsigned rmod4 = 0;
 605
 606 x0 = aes_schedule_transform(x0, ipt);
 607 x1 = aes_schedule_transform(x1, ipt);
 608 x0 ^= loadroundkey(rk32);
 609 x1 ^= loadroundkey(rk32);
 610 for (;;) {
 611 uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
 612 uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
 613
 614 subbytes(&io0, &jo0, x0, inv_, inva_);
 615 subbytes(&io1, &jo1, x1, inv_, inva_);
 616
 617 rk32 += 4;
 618 rmod4 = (rmod4 + 1) % 4;
 619 if (--nrounds == 0)
 620 break;
 621
 622 A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
 623 A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
 624 A_0 ^= loadroundkey(rk32);
 625 A_1 ^= loadroundkey(rk32);
 626 A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
 627 A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
 628 A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
 629 A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
 630 A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
 631 A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
 632 x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
 633 x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
 634 }
 635 x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
 636 x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
 637 x0 ^= loadroundkey(rk32);
 638 x1 ^= loadroundkey(rk32);
 639 return (uint8x16x2_t) { .val = {
 640 [0] = vqtbl1q_u8(x0, sr[rmod4]),
 641 [1] = vqtbl1q_u8(x1, sr[rmod4]),
 642 } };
 643}
 644
592uint8x16_t 645uint8x16_t
593aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) 646aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
594{ 647{
595 const uint32_t *rk32 = dec->aesd_aes.aes_rk; 648 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
596 unsigned i = 3 & ~(nrounds - 1); 649 unsigned i = 3 & ~(nrounds - 1);
597 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 650 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
598 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 651 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
599 uint8x16_t io, jo, mc; 652 uint8x16_t io, jo, mc;
600 653
601 x = aes_schedule_transform(x, dipt); 654 x = aes_schedule_transform(x, dipt);
602 x ^= loadroundkey(rk32); 655 x ^= loadroundkey(rk32);
603 rk32 += 4; 656 rk32 += 4;
604 657
@@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec,  @@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec,
618 x = vqtbl1q_u8(x, mc); 671 x = vqtbl1q_u8(x, mc);
619 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); 672 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);
620 673
621 x = vqtbl1q_u8(x, mc); 674 x = vqtbl1q_u8(x, mc);
622 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); 675 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);
623 676
624 mc = vextq_u8(mc, mc, 12); 677 mc = vextq_u8(mc, mc, 12);
625 } 678 }
626 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); 679 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
627 x ^= loadroundkey(rk32); 680 x ^= loadroundkey(rk32);
628 return vqtbl1q_u8(x, sr[i]); 681 return vqtbl1q_u8(x, sr[i]);
629} 682}
630 683
 684uint8x16x2_t
 685aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
 686{
 687 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
 688 unsigned i = 3 & ~(nrounds - 1);
 689 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 690 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 691 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 692 uint8x16_t io0, jo0, io1, jo1, mc;
 693
 694 x0 = aes_schedule_transform(x0, dipt);
 695 x1 = aes_schedule_transform(x1, dipt);
 696 x0 ^= loadroundkey(rk32);
 697 x1 ^= loadroundkey(rk32);
 698 rk32 += 4;
 699
 700 mc = mc_forward[3];
 701 for (;;) {
 702 subbytes(&io0, &jo0, x0, inv_, inva_);
 703 subbytes(&io1, &jo1, x1, inv_, inva_);
 704 if (--nrounds == 0)
 705 break;
 706
 707 x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
 708 x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
 709 x0 ^= loadroundkey(rk32);
 710 x1 ^= loadroundkey(rk32);
 711 rk32 += 4; /* next round key */
 712
 713 x0 = vqtbl1q_u8(x0, mc);
 714 x1 = vqtbl1q_u8(x1, mc);
 715 x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
 716 x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
 717
 718 x0 = vqtbl1q_u8(x0, mc);
 719 x1 = vqtbl1q_u8(x1, mc);
 720 x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
 721 x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
 722
 723 x0 = vqtbl1q_u8(x0, mc);
 724 x1 = vqtbl1q_u8(x1, mc);
 725 x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
 726 x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
 727
 728 mc = vextq_u8(mc, mc, 12);
 729 }
 730 x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
 731 x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
 732 x0 ^= loadroundkey(rk32);
 733 x1 ^= loadroundkey(rk32);
 734 return (uint8x16x2_t) { .val = {
 735 [0] = vqtbl1q_u8(x0, sr[i]),
 736 [1] = vqtbl1q_u8(x1, sr[i]),
 737 } };
 738}
 739
631#endif 740#endif
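
Sanity check for the interleaved round loops above: aes_neon_enc2 must agree
bit for bit with two independent aes_neon_enc1 calls on the same blocks.  A
sketch of such a check, not part of the commit, assuming vst1q_u8 (already
relied on by the loadblock/storeblock helpers) and memcmp are available:

/*
 * Sketch of a consistency check: encrypt two blocks through the 2x
 * path and through two 1x calls, and compare.  Returns 0 on match.
 */
static int
enc2_matches_enc1(const struct aesenc *enc, uint8x16_t a, uint8x16_t b,
    unsigned nrounds)
{
	uint8x16x2_t b2 = { .val = { a, b } };
	uint8_t buf1[32], buf2[32];

	b2 = aes_neon_enc2(enc, b2, nrounds);
	vst1q_u8(buf2, b2.val[0]);
	vst1q_u8(buf2 + 16, b2.val[1]);
	vst1q_u8(buf1, aes_neon_enc1(enc, a, nrounds));
	vst1q_u8(buf1 + 16, aes_neon_enc1(enc, b, nrounds));
	return memcmp(buf1, buf2, 32);
}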

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ 1/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -17,27 +17,27 @@ @@ -17,27 +17,27 @@
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/cdefs.h> 29#include <sys/cdefs.h>
 30__KERNEL_RCSID(0, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); 30__KERNEL_RCSID(0, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
31 31
32#include <sys/endian.h> 32#include <sys/endian.h>
33 33
34#ifdef _KERNEL 34#ifdef _KERNEL
35#include <sys/systm.h> 35#include <sys/systm.h>
36#include <lib/libkern/libkern.h> 36#include <lib/libkern/libkern.h>
37#else 37#else
38#include <assert.h> 38#include <assert.h>
39#include <inttypes.h> 39#include <inttypes.h>
40#include <stdio.h> 40#include <stdio.h>
41#define KASSERT assert 41#define KASSERT assert
42#endif 42#endif
43 43
@@ -101,34 +101,53 @@ void @@ -101,34 +101,53 @@ void
101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], 101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
103 uint32_t nrounds) 103 uint32_t nrounds)
104{ 104{
105 uint8x16_t iv0, cv, b; 105 uint8x16_t iv0, cv, b;
106 106
107 KASSERT(nbytes); 107 KASSERT(nbytes);
108 KASSERT(nbytes % 16 == 0); 108 KASSERT(nbytes % 16 == 0);
109 109
110 iv0 = loadblock(iv); 110 iv0 = loadblock(iv);
111 cv = loadblock(in + nbytes - 16); 111 cv = loadblock(in + nbytes - 16);
112 storeblock(iv, cv); 112 storeblock(iv, cv);
113 113
114 for (;;) { 114 if (nbytes % 32) {
 115 KASSERT(nbytes % 32 == 16);
115 b = aes_neon_dec1(dec, cv, nrounds); 116 b = aes_neon_dec1(dec, cv, nrounds);
116 if ((nbytes -= 16) == 0) 117 if ((nbytes -= 16) == 0)
117 break; 118 goto out;
 119 cv = loadblock(in + nbytes - 16);
 120 storeblock(out + nbytes, cv ^ b);
 121 }
 122
 123 for (;;) {
 124 uint8x16x2_t b2;
 125
 126 KASSERT(nbytes >= 32);
 127
 128 b2.val[1] = cv;
 129 b2.val[0] = cv = loadblock(in + nbytes - 32);
 130 b2 = aes_neon_dec2(dec, b2, nrounds);
 131 storeblock(out + nbytes - 16, cv ^ b2.val[1]);
 132 if ((nbytes -= 32) == 0) {
 133 b = b2.val[0];
 134 goto out;
 135 }
118 cv = loadblock(in + nbytes - 16); 136 cv = loadblock(in + nbytes - 16);
119 storeblock(out + nbytes, b ^ cv); 137 storeblock(out + nbytes, cv ^ b2.val[0]);
120 } 138 }
121 storeblock(out, b ^ iv0); 139
 140out: storeblock(out, b ^ iv0);
122} 141}
123 142
124static inline uint8x16_t 143static inline uint8x16_t
125aes_neon_xts_update(uint8x16_t t8) 144aes_neon_xts_update(uint8x16_t t8)
126{ 145{
127 const int32x4_t zero = vdupq_n_s32(0); 146 const int32x4_t zero = vdupq_n_s32(0);
128 const int32x4_t carry = {0x87, 1, 1, 1}; 147 const int32x4_t carry = {0x87, 1, 1, 1};
129 int32x4_t t, t_; 148 int32x4_t t, t_;
130 uint32x4_t mask; 149 uint32x4_t mask;
131 150
132 t = vreinterpretq_s32_u8(t8); 151 t = vreinterpretq_s32_u8(t8);
133 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ 152 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */
134 mask = vextq_u32(mask, mask, 3); /* rotate quarters */ 153 mask = vextq_u32(mask, mask, 3); /* rotate quarters */
@@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void) @@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void)
176} 195}
177 196
178void 197void
179aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], 198aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
180 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 199 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
181 uint32_t nrounds) 200 uint32_t nrounds)
182{ 201{
183 uint8x16_t t, b; 202 uint8x16_t t, b;
184 203
185 KASSERT(nbytes); 204 KASSERT(nbytes);
186 KASSERT(nbytes % 16 == 0); 205 KASSERT(nbytes % 16 == 0);
187 206
188 t = loadblock(tweak); 207 t = loadblock(tweak);
189 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 208 if (nbytes % 32) {
 209 KASSERT(nbytes % 32 == 16);
190 b = t ^ loadblock(in); 210 b = t ^ loadblock(in);
191 b = aes_neon_enc1(enc, b, nrounds); 211 b = aes_neon_enc1(enc, b, nrounds);
192 storeblock(out, t ^ b); 212 storeblock(out, t ^ b);
193 t = aes_neon_xts_update(t); 213 t = aes_neon_xts_update(t);
 214 nbytes -= 16;
 215 in += 16;
 216 out += 16;
 217 }
 218 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 219 uint8x16_t t1;
 220 uint8x16x2_t b2;
 221
 222 t1 = aes_neon_xts_update(t);
 223 b2.val[0] = t ^ loadblock(in);
 224 b2.val[1] = t1 ^ loadblock(in + 16);
 225 b2 = aes_neon_enc2(enc, b2, nrounds);
 226 storeblock(out, b2.val[0] ^ t);
 227 storeblock(out + 16, b2.val[1] ^ t1);
 228
 229 t = aes_neon_xts_update(t1);
194 } 230 }
195 storeblock(tweak, t); 231 storeblock(tweak, t);
196} 232}
197 233
198void 234void
199aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], 235aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
200 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 236 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
201 uint32_t nrounds) 237 uint32_t nrounds)
202{ 238{
203 uint8x16_t t, b; 239 uint8x16_t t, b;
204 240
205 KASSERT(nbytes); 241 KASSERT(nbytes);
206 KASSERT(nbytes % 16 == 0); 242 KASSERT(nbytes % 16 == 0);
207 243
208 t = loadblock(tweak); 244 t = loadblock(tweak);
209 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 245 if (nbytes % 32) {
 246 KASSERT(nbytes % 32 == 16);
210 b = t ^ loadblock(in); 247 b = t ^ loadblock(in);
211 b = aes_neon_dec1(dec, b, nrounds); 248 b = aes_neon_dec1(dec, b, nrounds);
212 storeblock(out, t ^ b); 249 storeblock(out, t ^ b);
213 t = aes_neon_xts_update(t); 250 t = aes_neon_xts_update(t);
 251 nbytes -= 16;
 252 in += 16;
 253 out += 16;
 254 }
 255 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 256 uint8x16_t t1;
 257 uint8x16x2_t b2;
 258
 259 t1 = aes_neon_xts_update(t);
 260 b2.val[0] = t ^ loadblock(in);
 261 b2.val[1] = t1 ^ loadblock(in + 16);
 262 b2 = aes_neon_dec2(dec, b2, nrounds);
 263 storeblock(out, b2.val[0] ^ t);
 264 storeblock(out + 16, b2.val[1] ^ t1);
 265
 266 t = aes_neon_xts_update(t1);
214 } 267 }
215 storeblock(tweak, t); 268 storeblock(tweak, t);
216} 269}
217 270
218void 271void
219aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], 272aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
220 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) 273 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
221{ 274{
222 uint8x16_t auth; 275 uint8x16_t auth;
223 276
224 KASSERT(nbytes); 277 KASSERT(nbytes);
225 KASSERT(nbytes % 16 == 0); 278 KASSERT(nbytes % 16 == 0);
226 279
@@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e @@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e
252 uint32_t nrounds) 305 uint32_t nrounds)
253{ 306{
254 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 307 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
255 uint8x16_t auth, ptxt, ctr_be; 308 uint8x16_t auth, ptxt, ctr_be;
256 uint32x4_t ctr; 309 uint32x4_t ctr;
257 310
258 KASSERT(nbytes); 311 KASSERT(nbytes);
259 KASSERT(nbytes % 16 == 0); 312 KASSERT(nbytes % 16 == 0);
260 313
261 auth = loadblock(authctr); 314 auth = loadblock(authctr);
262 ctr_be = loadblock(authctr + 16); 315 ctr_be = loadblock(authctr + 16);
263 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 316 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
264 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 317 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 318 uint8x16x2_t b2;
265 ptxt = loadblock(in); 319 ptxt = loadblock(in);
266 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 
267 ctr = vaddq_u32(ctr, ctr32_inc); 320 ctr = vaddq_u32(ctr, ctr32_inc);
268 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 321 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
269 storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); 322
 323 b2.val[0] = auth ^ ptxt;
 324 b2.val[1] = ctr_be;
 325 b2 = aes_neon_enc2(enc, b2, nrounds);
 326 auth = b2.val[0];
 327 storeblock(out, ptxt ^ b2.val[1]);
270 } 328 }
271 storeblock(authctr, auth); 329 storeblock(authctr, auth);
272 storeblock(authctr + 16, ctr_be); 330 storeblock(authctr + 16, ctr_be);
273} 331}
274 332
275void 333void
276aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], 334aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
277 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 335 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
278 uint32_t nrounds) 336 uint32_t nrounds)
279{ 337{
280 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 338 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
281 uint8x16_t auth, ctr_be, ptxt; 339 uint8x16_t auth, ctr_be, ptxt, pad;
282 uint32x4_t ctr; 340 uint32x4_t ctr;
283 341
284 KASSERT(nbytes); 342 KASSERT(nbytes);
285 KASSERT(nbytes % 16 == 0); 343 KASSERT(nbytes % 16 == 0);
286 344
287 auth = loadblock(authctr); 
288 ctr_be = loadblock(authctr + 16); 345 ctr_be = loadblock(authctr + 16);
289 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 346 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
290 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 347 ctr = vaddq_u32(ctr, ctr32_inc);
 348 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
 349 pad = aes_neon_enc1(enc, ctr_be, nrounds);
 350 auth = loadblock(authctr);
 351 for (;; in += 16, out += 16) {
 352 uint8x16x2_t b2;
 353
 354 ptxt = loadblock(in) ^ pad;
 355 auth ^= ptxt;
 356 storeblock(out, ptxt);
 357
 358 if ((nbytes -= 16) == 0)
 359 break;
 360
291 ctr = vaddq_u32(ctr, ctr32_inc); 361 ctr = vaddq_u32(ctr, ctr32_inc);
292 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 362 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
293 ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); 363 b2.val[0] = auth;
294 storeblock(out, ptxt); 364 b2.val[1] = ctr_be;
295 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 365 b2 = aes_neon_enc2(enc, b2, nrounds);
 366 auth = b2.val[0];
 367 pad = b2.val[1];
296 } 368 }
 369 auth = aes_neon_enc1(enc, auth, nrounds);
297 storeblock(authctr, auth); 370 storeblock(authctr, auth);
298 storeblock(authctr + 16, ctr_be); 371 storeblock(authctr + 16, ctr_be);
299} 372}
300 373
301int 374int
302aes_neon_selftest(void) 375aes_neon_selftest(void)
303{ 376{
304 377
305 if (aes_neon_xts_update_selftest()) 378 if (aes_neon_xts_update_selftest())
306 return -1; 379 return -1;
307 380
308 return 0; 381 return 0;
309} 382}
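
The XTS loops above advance the tweak with aes_neon_xts_update, which
multiplies the 128-bit tweak by x in GF(2^128) with the usual XTS reduction
polynomial.  A scalar reference sketch of the same update (little-endian
byte order), for comparison with the NEON version:

/*
 * Scalar sketch of the XTS tweak update: treat the tweak as a
 * little-endian polynomial over GF(2), multiply by x, and reduce
 * modulo x^128 + x^7 + x^2 + x + 1.
 */
static void
xts_update_scalar(uint8_t t[static 16])
{
	unsigned carry = 0, msb, i;

	for (i = 0; i < 16; i++) {
		msb = t[i] >> 7;
		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;	/* x^128 = x^7 + x^2 + x + 1 */
}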

cvs diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h

--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ 1/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -29,14 +29,43 @@ @@ -29,14 +29,43 @@
29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
31 31
32#include <sys/types.h> 32#include <sys/types.h>
33 33
34#include "arm_neon.h" 34#include "arm_neon.h"
35 35
36#include <crypto/aes/aes.h> 36#include <crypto/aes/aes.h>
37#include <crypto/aes/arch/arm/aes_neon.h> 37#include <crypto/aes/arch/arm/aes_neon.h>
38 38
39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); 39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); 40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
41 41
 42#ifdef __aarch64__
 43
 44uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
 45uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
 46
 47#else
 48
 49static inline uint8x16x2_t
 50aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
 51{
 52
 53 return (uint8x16x2_t) { .val = {
 54 [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
 55 [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
 56 } };
 57}
 58
 59static inline uint8x16x2_t
 60aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
 61{
 62
 63 return (uint8x16x2_t) { .val = {
 64 [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
 65 [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
 66 } };
 67}
 68
 69#endif
 70
42#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ 71#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
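
The 32-bit fallbacks above let the mode code in aes_neon_subr.c call
aes_neon_enc2 unconditionally: on armv7 it simply expands to two 1x calls.
One place that pays off is CCM, where each 16-byte step needs one AES call
for the CBC-MAC and one for the CTR keystream; the two are independent, so
they ride through a single 2x call.  A stripped-down sketch of that step
(names follow aes_neon_subr.c; counter increment and byte swapping omitted):

/*
 * Sketch of one CCM encryption step: the CBC-MAC block and the CTR
 * keystream block are independent, so pair them.  Returns the updated
 * CBC-MAC state; *pad receives the keystream block to XOR into the
 * plaintext.
 */
static uint8x16_t
ccm_step_sketch(const struct aesenc *enc, uint8x16_t auth,
    uint8x16_t ctr_be, uint8x16_t ptxt, uint8x16_t *pad, uint32_t nrounds)
{
	uint8x16x2_t b2;

	b2.val[0] = auth ^ ptxt;	/* next CBC-MAC input */
	b2.val[1] = ctr_be;		/* big-endian counter block */
	b2 = aes_neon_enc2(enc, b2, nrounds);
	*pad = b2.val[1];
	return b2.val[0];
}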

cvs diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -32,38 +32,40 @@ @@ -32,38 +32,40 @@
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint32x4_t uint32x4_t; 42typedef __Uint32x4_t uint32x4_t;
43typedef __Uint64x2_t uint64x2_t; 43typedef __Uint64x2_t uint64x2_t;
44typedef __Uint8x16_t uint8x16_t; 44typedef __Uint8x16_t uint8x16_t;
 45typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
45#else 46#else
46typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
47typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
48typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
49typedef __simd128_uint32_t uint32x4_t; 50typedef __simd128_uint32_t uint32x4_t;
50typedef __simd128_uint64_t uint64x2_t; 51typedef __simd128_uint64_t uint64x2_t;
51typedef __simd128_uint8_t uint8x16_t; 52typedef __simd128_uint8_t uint8x16_t;
52 53
53typedef __simd64_int8_t int8x8_t; 54typedef __simd64_int8_t int8x8_t;
54typedef __simd64_uint8_t uint8x8_t; 55typedef __simd64_uint8_t uint8x8_t;
55typedef __builtin_neon_udi uint64x1_t; 56typedef __builtin_neon_udi uint64x1_t;
56typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 57typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
 58typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
57#endif 59#endif
58 60
59#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
60#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
61#else 63#else
62#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
63#endif 65#endif
64 66
65#elif defined(__clang__) 67#elif defined(__clang__)
66 68
67#define _INTRINSATTR \ 69#define _INTRINSATTR \
68 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
69 71