Mon Jul 27 20:58:56 2020 UTC
Add vsriq_n_u32. Note that VSRI seems to hurt here: the VSHL/VSHR/VORR
rotate in chacha_neon.c is kept, since the VSHL/VSRI variant is slower in
practice, at least on Cortex-A8.


(riastradh)
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c
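
For context, the rotate the log message refers to can be written two ways
with the intrinsics defined in arm_neon.h below.  A minimal sketch (the
helper names rol32_orr/rol32_sri are illustrative, not part of the tree;
the committed vrolq_n_u32 keeps the first form):

	/* Rotate each 32-bit lane left by n (0 < n < 32): shift/shift/or. */
	static inline uint32x4_t
	rol32_orr(uint32x4_t x, uint8_t n)
	{
		return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
	}

	/*
	 * Same rotation via shift-left plus shift-right-and-insert (VSRI).
	 * One instruction shorter, but slower in practice, at least on
	 * Cortex-A8, which is why the form above stays enabled.
	 */
	static inline uint32x4_t
	rol32_sri(uint32x4_t x, uint8_t n)
	{
		return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
	}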

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h

--- src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:06 1.2
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:56 1.3
@@ -1,631 +1,665 @@ @@ -1,631 +1,665 @@
1/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H 29#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H 30#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
31 31
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint16x8_t uint16x8_t; 42typedef __Uint16x8_t uint16x8_t;
43typedef __Uint32x4_t uint32x4_t; 43typedef __Uint32x4_t uint32x4_t;
44typedef __Uint64x2_t uint64x2_t; 44typedef __Uint64x2_t uint64x2_t;
45typedef __Uint8x16_t uint8x16_t; 45typedef __Uint8x16_t uint8x16_t;
46#else 46#else
47typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
48typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
49typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
50typedef __simd128_uint16_t uint16x8_t; 50typedef __simd128_uint16_t uint16x8_t;
51typedef __simd128_uint32_t uint32x4_t; 51typedef __simd128_uint32_t uint32x4_t;
52typedef __simd128_uint64_t uint64x2_t; 52typedef __simd128_uint64_t uint64x2_t;
53typedef __simd128_uint8_t uint8x16_t; 53typedef __simd128_uint8_t uint8x16_t;
54 54
55typedef __simd64_int8_t int8x8_t; 55typedef __simd64_int8_t int8x8_t;
56typedef __simd64_uint8_t uint8x8_t; 56typedef __simd64_uint8_t uint8x8_t;
57typedef __builtin_neon_udi uint64x1_t; 57typedef __builtin_neon_udi uint64x1_t;
58typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 58typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
59#endif 59#endif
60 60
61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
63#else 63#else
64#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
65#endif 65#endif
66 66
67#elif defined(__clang__) 67#elif defined(__clang__)
68 68
69#define _INTRINSATTR \ 69#define _INTRINSATTR \
70 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
71 71
72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; 72typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; 73typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; 74typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
75 75
76typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; 76typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
77typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; 77typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
78typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; 78typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
79typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; 79typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
80 80
81typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t; 81typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
82typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 82typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
83 83
84#ifdef __LITTLE_ENDIAN__ 84#ifdef __LITTLE_ENDIAN__
85#define __neon_lane_index(__v, __i) __i 85#define __neon_lane_index(__v, __i) __i
86#else 86#else
87#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 87#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
88#endif 88#endif
89 89
90#else 90#else
91 91
92#error Teach me how to neon in your compile! 92#error Teach me how to neon in your compile!
93 93
94#endif 94#endif
95 95
96_INTRINSATTR 96_INTRINSATTR
97static __inline uint32x4_t 97static __inline uint32x4_t
98vaddq_u32(uint32x4_t __v0, uint32x4_t __v1) 98vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
99{ 99{
100 return __v0 + __v1; 100 return __v0 + __v1;
101} 101}
102 102
103_INTRINSATTR 103_INTRINSATTR
104static __inline uint32x4_t 104static __inline uint32x4_t
105vcltq_s32(int32x4_t __v0, int32x4_t __v1) 105vcltq_s32(int32x4_t __v0, int32x4_t __v1)
106{ 106{
107 return (uint32x4_t)(__v0 < __v1); 107 return (uint32x4_t)(__v0 < __v1);
108} 108}
109 109
110_INTRINSATTR 110_INTRINSATTR
111static __inline int32x4_t 111static __inline int32x4_t
112vdupq_n_s32(int32_t __x) 112vdupq_n_s32(int32_t __x)
113{ 113{
114 return (int32x4_t) { __x, __x, __x, __x }; 114 return (int32x4_t) { __x, __x, __x, __x };
115} 115}
116 116
117_INTRINSATTR 117_INTRINSATTR
118static __inline uint32x4_t 118static __inline uint32x4_t
119vdupq_n_u32(uint32_t __x) 119vdupq_n_u32(uint32_t __x)
120{ 120{
121 return (uint32x4_t) { __x, __x, __x, __x }; 121 return (uint32x4_t) { __x, __x, __x, __x };
122} 122}
123 123
124_INTRINSATTR 124_INTRINSATTR
125static __inline uint8x16_t 125static __inline uint8x16_t
126vdupq_n_u8(uint8_t __x) 126vdupq_n_u8(uint8_t __x)
127{ 127{
128 return (uint8x16_t) { 128 return (uint8x16_t) {
129 __x, __x, __x, __x, __x, __x, __x, __x, 129 __x, __x, __x, __x, __x, __x, __x, __x,
130 __x, __x, __x, __x, __x, __x, __x, __x, 130 __x, __x, __x, __x, __x, __x, __x, __x,
131 }; 131 };
132} 132}
133 133
134#if defined(__GNUC__) && !defined(__clang__) 134#if defined(__GNUC__) && !defined(__clang__)
135_INTRINSATTR 135_INTRINSATTR
136static __inline uint32x4_t 136static __inline uint32x4_t
137vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) 137vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
138{ 138{
139#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 139#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
140 return __builtin_shuffle(__hi, __lo, 140 return __builtin_shuffle(__hi, __lo,
141 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); 141 (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
142#else 142#else
143 return __builtin_shuffle(__lo, __hi, 143 return __builtin_shuffle(__lo, __hi,
144 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); 144 (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
145#endif 145#endif
146} 146}
147#elif defined(__clang__) 147#elif defined(__clang__)
148#ifdef __LITTLE_ENDIAN__ 148#ifdef __LITTLE_ENDIAN__
149#define vextq_u32(__lo, __hi, __i) \ 149#define vextq_u32(__lo, __hi, __i) \
150 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 150 (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
151 (int8x16_t)(__hi), (__i), 50) 151 (int8x16_t)(__hi), (__i), 50)
152#else 152#else
153#define vextq_u32(__lo, __hi, __i) ( \ 153#define vextq_u32(__lo, __hi, __i) ( \
154{ \ 154{ \
155 uint32x4_t __tlo = (__lo); \ 155 uint32x4_t __tlo = (__lo); \
156 uint32x4_t __thi = (__hi); \ 156 uint32x4_t __thi = (__hi); \
157 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \ 157 uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
158 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \ 158 uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
159 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 159 uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
160 (int8x16_t)__hi_r, __i, 50); \ 160 (int8x16_t)__hi_r, __i, 50); \
161 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 161 __builtin_shufflevector(__r, __r, 3,2,1,0); \
162}) 162})
163#endif /* __LITTLE_ENDIAN__ */ 163#endif /* __LITTLE_ENDIAN__ */
164#endif 164#endif
165 165
166#if defined(__GNUC__) && !defined(__clang__) 166#if defined(__GNUC__) && !defined(__clang__)
167_INTRINSATTR 167_INTRINSATTR
168static __inline uint8x16_t 168static __inline uint8x16_t
169vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) 169vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
170{ 170{
171#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 171#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
172 return __builtin_shuffle(__hi, __lo, 172 return __builtin_shuffle(__hi, __lo,
173 (uint8x16_t) { 173 (uint8x16_t) {
174 16 - __i, 17 - __i, 18 - __i, 19 - __i, 174 16 - __i, 17 - __i, 18 - __i, 19 - __i,
175 20 - __i, 21 - __i, 22 - __i, 23 - __i, 175 20 - __i, 21 - __i, 22 - __i, 23 - __i,
176 24 - __i, 25 - __i, 26 - __i, 27 - __i, 176 24 - __i, 25 - __i, 26 - __i, 27 - __i,
177 28 - __i, 29 - __i, 30 - __i, 31 - __i, 177 28 - __i, 29 - __i, 30 - __i, 31 - __i,
178 }); 178 });
179#else 179#else
180 return __builtin_shuffle(__lo, __hi, 180 return __builtin_shuffle(__lo, __hi,
181 (uint8x16_t) { 181 (uint8x16_t) {
182 __i + 0, __i + 1, __i + 2, __i + 3, 182 __i + 0, __i + 1, __i + 2, __i + 3,
183 __i + 4, __i + 5, __i + 6, __i + 7, 183 __i + 4, __i + 5, __i + 6, __i + 7,
184 __i + 8, __i + 9, __i + 10, __i + 11, 184 __i + 8, __i + 9, __i + 10, __i + 11,
185 __i + 12, __i + 13, __i + 14, __i + 15, 185 __i + 12, __i + 13, __i + 14, __i + 15,
186 }); 186 });
187#endif 187#endif
188} 188}
189#elif defined(__clang__) 189#elif defined(__clang__)
190#ifdef __LITTLE_ENDIAN__ 190#ifdef __LITTLE_ENDIAN__
191#define vextq_u8(__lo, __hi, __i) \ 191#define vextq_u8(__lo, __hi, __i) \
192 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \ 192 (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
193 (int8x16_t)(__hi), (__i), 48) 193 (int8x16_t)(__hi), (__i), 48)
194#else 194#else
195#define vextq_u8(__lo, __hi, __i) ( \ 195#define vextq_u8(__lo, __hi, __i) ( \
196{ \ 196{ \
197 uint8x16_t __tlo = (__lo); \ 197 uint8x16_t __tlo = (__lo); \
198 uint8x16_t __thi = (__hi); \ 198 uint8x16_t __thi = (__hi); \
199 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \ 199 uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 200 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
201 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \ 201 uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 202 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
203 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \ 203 uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
204 (int8x16_t)__hi_r, (__i), 48); \ 204 (int8x16_t)__hi_r, (__i), 48); \
 205	__builtin_shufflevector(__r, __r, \	 205	__builtin_shufflevector(__r, __r, \
206 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \ 206 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
207}) 207})
208#endif /* __LITTLE_ENDIAN */ 208#endif /* __LITTLE_ENDIAN */
209#endif 209#endif
210 210
211#if defined(__GNUC__) && !defined(__clang__) 211#if defined(__GNUC__) && !defined(__clang__)
212_INTRINSATTR 212_INTRINSATTR
213static __inline uint32_t 213static __inline uint32_t
214vgetq_lane_u32(uint32x4_t __v, uint8_t __i) 214vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
215{ 215{
216#ifdef __aarch64__ 216#ifdef __aarch64__
217 return __v[__i]; 217 return __v[__i];
218#else 218#else
219 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); 219 return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
220#endif 220#endif
221} 221}
222#elif defined(__clang__) 222#elif defined(__clang__)
223#define vgetq_lane_u32(__v, __i) \ 223#define vgetq_lane_u32(__v, __i) \
224 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \ 224 (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
225 __neon_lane_index(__v, __i)) 225 __neon_lane_index(__v, __i))
226#endif 226#endif
227 227
228_INTRINSATTR 228_INTRINSATTR
229static __inline uint32x4_t 229static __inline uint32x4_t
230vld1q_u32(const uint32_t *__p32) 230vld1q_u32(const uint32_t *__p32)
231{ 231{
232#if defined(__GNUC__) && !defined(__clang__) 232#if defined(__GNUC__) && !defined(__clang__)
233#ifdef __aarch64__ 233#ifdef __aarch64__
234 const __builtin_aarch64_simd_si *__p = 234 const __builtin_aarch64_simd_si *__p =
235 (const __builtin_aarch64_simd_si *)__p32; 235 (const __builtin_aarch64_simd_si *)__p32;
236 236
237 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p); 237 return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
238#else 238#else
239 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32; 239 const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;
240 240
241 return (uint32x4_t)__builtin_neon_vld1v4si(__p); 241 return (uint32x4_t)__builtin_neon_vld1v4si(__p);
242#endif 242#endif
243#elif defined(__clang__) 243#elif defined(__clang__)
244 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50); 244 uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
245#ifndef __LITTLE_ENDIAN__ 245#ifndef __LITTLE_ENDIAN__
246 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 246 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
247#endif 247#endif
248 return __v; 248 return __v;
249#endif 249#endif
250} 250}
251 251
252_INTRINSATTR 252_INTRINSATTR
253static __inline uint8x16_t 253static __inline uint8x16_t
254vld1q_u8(const uint8_t *__p8) 254vld1q_u8(const uint8_t *__p8)
255{ 255{
256#if defined(__GNUC__) && !defined(__clang__) 256#if defined(__GNUC__) && !defined(__clang__)
257#ifdef __aarch64__ 257#ifdef __aarch64__
258 const __builtin_aarch64_simd_qi *__p = 258 const __builtin_aarch64_simd_qi *__p =
259 (const __builtin_aarch64_simd_qi *)__p8; 259 (const __builtin_aarch64_simd_qi *)__p8;
260 260
261 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); 261 return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
262#else 262#else
263 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; 263 const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;
264 264
265 return (uint8x16_t)__builtin_neon_vld1v16qi(__p); 265 return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
266#endif 266#endif
267#elif defined(__clang__) 267#elif defined(__clang__)
268 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48); 268 uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
269#ifndef __LITTLE_ENDIAN__ 269#ifndef __LITTLE_ENDIAN__
270 __v = __builtin_shufflevector(__v, __v, 270 __v = __builtin_shufflevector(__v, __v,
271 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 271 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
272#endif 272#endif
273 return __v; 273 return __v;
274#endif 274#endif
275} 275}
276 276
277_INTRINSATTR 277_INTRINSATTR
278static __inline uint8x16_t 278static __inline uint8x16_t
279vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) 279vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
280{ 280{
281#if defined(__GNUC__) && !defined(__clang__) 281#if defined(__GNUC__) && !defined(__clang__)
282#ifdef __aarch64__ 282#ifdef __aarch64__
283 uint8x16_t __res; 283 uint8x16_t __res;
284 __asm__("tbl %0.16b, {%1.16b}, %2.16b" 284 __asm__("tbl %0.16b, {%1.16b}, %2.16b"
285 : "=w"(__res) : "w"(__tab), "w"(__idx)); 285 : "=w"(__res) : "w"(__tab), "w"(__idx));
286 return __res; 286 return __res;
287#else 287#else
288 /* 288 /*
289 * No native ARMv7 NEON instruction for this, so do it via two 289 * No native ARMv7 NEON instruction for this, so do it via two
290 * half-width TBLs instead (vtbl2_u8 equivalent). 290 * half-width TBLs instead (vtbl2_u8 equivalent).
291 */ 291 */
292 uint64x2_t __tab64 = (uint64x2_t)__tab; 292 uint64x2_t __tab64 = (uint64x2_t)__tab;
293 uint8x8_t __tablo = (uint8x8_t)__tab64[0]; 293 uint8x8_t __tablo = (uint8x8_t)__tab64[0];
294 uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; 294 uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
295 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; 295 uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
296 union { 296 union {
297 uint8x8x2_t __u8x8x2; 297 uint8x8x2_t __u8x8x2;
298 __builtin_neon_ti __ti; 298 __builtin_neon_ti __ti;
299 } __u = { __tab8x8x2 }; 299 } __u = { __tab8x8x2 };
300 uint64x2_t __idx64, __out64; 300 uint64x2_t __idx64, __out64;
301 int8x8_t __idxlo, __idxhi, __outlo, __outhi; 301 int8x8_t __idxlo, __idxhi, __outlo, __outhi;
302 302
303 __idx64 = (uint64x2_t)__idx; 303 __idx64 = (uint64x2_t)__idx;
304 __idxlo = (int8x8_t)__idx64[0]; 304 __idxlo = (int8x8_t)__idx64[0];
305 __idxhi = (int8x8_t)__idx64[1]; 305 __idxhi = (int8x8_t)__idx64[1];
306 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); 306 __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
307 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); 307 __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
308 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; 308 __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };
309 309
310 return (uint8x16_t)__out64; 310 return (uint8x16_t)__out64;
311#endif 311#endif
312#elif defined(__clang__) 312#elif defined(__clang__)
313#ifdef __LITTLE_ENDIAN__ 313#ifdef __LITTLE_ENDIAN__
314 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab, 314 return (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab,
315 (int8x16_t)__idx, 48); 315 (int8x16_t)__idx, 48);
316#else 316#else
 317	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,	 317	uint8x16_t __tab_r = __builtin_shufflevector(__tab, __tab,
 318	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);	 318	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 319	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,	 319	uint8x16_t __idx_r = __builtin_shufflevector(__idx, __idx,
 320	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);	 320	    15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
 321	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,	 321	uint8x16_t __r = (uint8x16_t)__builtin_neon_vqtbl1q_v((int8x16_t)__tab_r,
 322	    (int8x16_t)__idx_r, 48);	 322	    (int8x16_t)__idx_r, 48);
323 return __builtin_shufflevector(__r, __r, 323 return __builtin_shufflevector(__r, __r,
324 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 324 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
325#endif 325#endif
326#endif 326#endif
327} 327}
328 328
329_INTRINSATTR 329_INTRINSATTR
330static __inline int32x4_t 330static __inline int32x4_t
331vreinterpretq_s32_u8(uint8x16_t __v) 331vreinterpretq_s32_u8(uint8x16_t __v)
332{ 332{
333 return (int32x4_t)__v; 333 return (int32x4_t)__v;
334} 334}
335 335
336_INTRINSATTR 336_INTRINSATTR
337static __inline uint16x8_t 337static __inline uint16x8_t
338vreinterpretq_u16_u32(uint32x4_t __v) 338vreinterpretq_u16_u32(uint32x4_t __v)
339{ 339{
340 return (uint16x8_t)__v; 340 return (uint16x8_t)__v;
341} 341}
342 342
343_INTRINSATTR 343_INTRINSATTR
344static __inline uint32x4_t 344static __inline uint32x4_t
345vreinterpretq_u32_u16(uint16x8_t __v) 345vreinterpretq_u32_u16(uint16x8_t __v)
346{ 346{
347 return (uint32x4_t)__v; 347 return (uint32x4_t)__v;
348} 348}
349 349
350_INTRINSATTR 350_INTRINSATTR
351static __inline uint32x4_t 351static __inline uint32x4_t
352vreinterpretq_u32_u64(uint64x2_t __v) 352vreinterpretq_u32_u64(uint64x2_t __v)
353{ 353{
354 return (uint32x4_t)__v; 354 return (uint32x4_t)__v;
355} 355}
356 356
357_INTRINSATTR 357_INTRINSATTR
358static __inline uint32x4_t 358static __inline uint32x4_t
359vreinterpretq_u32_u8(uint8x16_t __v) 359vreinterpretq_u32_u8(uint8x16_t __v)
360{ 360{
361 return (uint32x4_t)__v; 361 return (uint32x4_t)__v;
362} 362}
363 363
364_INTRINSATTR 364_INTRINSATTR
365static __inline uint64x2_t 365static __inline uint64x2_t
366vreinterpretq_u64_u32(uint32x4_t __v) 366vreinterpretq_u64_u32(uint32x4_t __v)
367{ 367{
368 return (uint64x2_t)__v; 368 return (uint64x2_t)__v;
369} 369}
370 370
371_INTRINSATTR 371_INTRINSATTR
372static __inline uint64x2_t 372static __inline uint64x2_t
373vreinterpretq_u64_u8(uint8x16_t __v) 373vreinterpretq_u64_u8(uint8x16_t __v)
374{ 374{
375 return (uint64x2_t)__v; 375 return (uint64x2_t)__v;
376} 376}
377 377
378_INTRINSATTR 378_INTRINSATTR
379static __inline uint8x16_t 379static __inline uint8x16_t
380vreinterpretq_u8_s32(int32x4_t __v) 380vreinterpretq_u8_s32(int32x4_t __v)
381{ 381{
382 return (uint8x16_t)__v; 382 return (uint8x16_t)__v;
383} 383}
384 384
385_INTRINSATTR 385_INTRINSATTR
386static __inline uint8x16_t 386static __inline uint8x16_t
387vreinterpretq_u8_u32(uint32x4_t __v) 387vreinterpretq_u8_u32(uint32x4_t __v)
388{ 388{
389 return (uint8x16_t)__v; 389 return (uint8x16_t)__v;
390} 390}
391 391
392_INTRINSATTR 392_INTRINSATTR
393static __inline uint8x16_t 393static __inline uint8x16_t
394vreinterpretq_u8_u64(uint64x2_t __v) 394vreinterpretq_u8_u64(uint64x2_t __v)
395{ 395{
396 return (uint8x16_t)__v; 396 return (uint8x16_t)__v;
397} 397}
398 398
399_INTRINSATTR 399_INTRINSATTR
400static __inline uint16x8_t 400static __inline uint16x8_t
401vrev32q_u16(uint16x8_t __v) 401vrev32q_u16(uint16x8_t __v)
402{ 402{
403#if defined(__GNUC__) && !defined(__clang__) 403#if defined(__GNUC__) && !defined(__clang__)
404 return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 }); 404 return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
405#elif defined(__clang__) 405#elif defined(__clang__)
 406	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);	 406	return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
407#endif 407#endif
408} 408}
409 409
410_INTRINSATTR 410_INTRINSATTR
411static __inline uint8x16_t 411static __inline uint8x16_t
412vrev32q_u8(uint8x16_t __v) 412vrev32q_u8(uint8x16_t __v)
413{ 413{
414#if defined(__GNUC__) && !defined(__clang__) 414#if defined(__GNUC__) && !defined(__clang__)
415 return __builtin_shuffle(__v, 415 return __builtin_shuffle(__v,
416 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }); 416 (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
417#elif defined(__clang__) 417#elif defined(__clang__)
 418	return __builtin_shufflevector(__v, __v,	 418	return __builtin_shufflevector(__v, __v,
419 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); 419 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
420#endif 420#endif
421} 421}
422 422
423#if defined(__GNUC__) && !defined(__clang__) 423#if defined(__GNUC__) && !defined(__clang__)
424_INTRINSATTR 424_INTRINSATTR
425static __inline uint32x4_t 425static __inline uint32x4_t
426vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) 426vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
427{ 427{
428 __v[__neon_lane_index(__v, __i)] = __x; 428 __v[__neon_lane_index(__v, __i)] = __x;
429 return __v; 429 return __v;
430} 430}
431#elif defined(__clang__) 431#elif defined(__clang__)
432#define vsetq_lane_u32(__x, __v, __i) \ 432#define vsetq_lane_u32(__x, __v, __i) \
433 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \ 433 (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
434 __neon_lane_index(__v, __i)) 434 __neon_lane_index(__v, __i))
435#endif 435#endif
436 436
437#if defined(__GNUC__) && !defined(__clang__) 437#if defined(__GNUC__) && !defined(__clang__)
438_INTRINSATTR 438_INTRINSATTR
439static __inline uint64x2_t 439static __inline uint64x2_t
440vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) 440vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
441{ 441{
442 __v[__neon_lane_index(__v, __i)] = __x; 442 __v[__neon_lane_index(__v, __i)] = __x;
443 return __v; 443 return __v;
444} 444}
445#elif defined(__clang__) 445#elif defined(__clang__)
446#define vsetq_lane_u64(__x, __v, __i) \ 446#define vsetq_lane_u64(__x, __v, __i) \
 447	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \	 447	(uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
 448	    __neon_lane_index(__v, __i))	 448	    __neon_lane_index(__v, __i))
449#endif 449#endif
450 450
451#if defined(__GNUC__) && !defined(__clang__) 451#if defined(__GNUC__) && !defined(__clang__)
452_INTRINSATTR 452_INTRINSATTR
453static __inline uint32x4_t 453static __inline uint32x4_t
454vshlq_n_u32(uint32x4_t __v, uint8_t __bits) 454vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
455{ 455{
456#ifdef __aarch64__ 456#ifdef __aarch64__
457 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits); 457 return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
458#else 458#else
459 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits); 459 return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
460#endif 460#endif
461} 461}
462#elif defined(__clang__) 462#elif defined(__clang__)
463#define vshlq_n_u32(__v, __bits) \ 463#define vshlq_n_u32(__v, __bits) \
464 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50) 464 (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
465#endif 465#endif
466 466
467#if defined(__GNUC__) && !defined(__clang__) 467#if defined(__GNUC__) && !defined(__clang__)
468_INTRINSATTR 468_INTRINSATTR
469static __inline uint32x4_t 469static __inline uint32x4_t
470vshrq_n_u32(uint32x4_t __v, uint8_t __bits) 470vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
471{ 471{
472#ifdef __aarch64__ 472#ifdef __aarch64__
473 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits); 473 return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
474#else 474#else
475 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits); 475 return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
476#endif 476#endif
477} 477}
478#elif defined(__clang__) 478#elif defined(__clang__)
 479#define vshrq_n_u32(__v, __bits)	\	 479#define vshrq_n_u32(__v, __bits)	\
480 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50) 480 (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
481#endif 481#endif
482 482
483#if defined(__GNUC__) && !defined(__clang__) 483#if defined(__GNUC__) && !defined(__clang__)
484_INTRINSATTR 484_INTRINSATTR
485static __inline uint8x16_t 485static __inline uint8x16_t
486vshrq_n_u8(uint8x16_t __v, uint8_t __bits) 486vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
487{ 487{
488#ifdef __aarch64__ 488#ifdef __aarch64__
489 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); 489 return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
490#else 490#else
491 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); 491 return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
492#endif 492#endif
493} 493}
494#elif defined(__clang__) 494#elif defined(__clang__)
495#define vshrq_n_u8(__v, __bits) \ 495#define vshrq_n_u8(__v, __bits) \
496 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48) 496 (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
497#endif 497#endif
498 498
499#if defined(__GNUC__) && !defined(__clang__) 499#if defined(__GNUC__) && !defined(__clang__)
500_INTRINSATTR 500_INTRINSATTR
501static __inline int32x4_t 501static __inline int32x4_t
502vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) 502vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
503{ 503{
504#ifdef __aarch64__ 504#ifdef __aarch64__
505 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); 505 return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
506#else 506#else
507 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); 507 return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
508#endif 508#endif
509} 509}
510#elif defined(__clang__) 510#elif defined(__clang__)
511#ifdef __LITTLE_ENDIAN__ 511#ifdef __LITTLE_ENDIAN__
512#define vsliq_n_s32(__vins, __vsh, __bits) \ 512#define vsliq_n_s32(__vins, __vsh, __bits) \
513 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \ 513 (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
514 (int32x4_t)(__vsh), (__bits), 34) 514 (int32x4_t)(__vsh), (__bits), 34)
515#else 515#else
516#define vsliq_n_s32(__vins, __vsh, __bits) ( \ 516#define vsliq_n_s32(__vins, __vsh, __bits) ( \
517{ \ 517{ \
518 int32x4_t __tvins = (__vins); \ 518 int32x4_t __tvins = (__vins); \
519 int32x4_t __tvsh = (__vsh); \ 519 int32x4_t __tvsh = (__vsh); \
520 uint8_t __tbits = (__bits); \ 520 uint8_t __tbits = (__bits); \
521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ 521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
522 3,2,1,0); \ 522 3,2,1,0); \
523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ 523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
524 3,2,1,0); \ 524 3,2,1,0); \
 525	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \	 525	int32x4_t __r = __builtin_neon_vsliq_n_v(__vins_r, __vsh_r, __tbits, \
526 34); \ 526 34); \
527 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 527 __builtin_shufflevector(__r, __r, 3,2,1,0); \
528}) 528})
529#endif /* __LITTLE_ENDIAN__ */ 529#endif /* __LITTLE_ENDIAN__ */
530#endif 530#endif
531 531
 532#if defined(__GNUC__) && !defined(__clang__)
 533_INTRINSATTR
 534static __inline uint32x4_t
 535vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
 536{
 537#ifdef __aarch64__
 538 return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
 539#else
 540 return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
 541 (int32x4_t)__vsh, __bits);
 542#endif
 543}
 544#elif defined(__clang__)
 545#ifdef __LITTLE_ENDIAN__
 546#define vsriq_n_u32(__vins, __vsh, __bits) \
 547 (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
 548 (int32x4_t)(__vsh), (__bits), 34)
 549#else
	 550#define vsriq_n_u32(__vins, __vsh, __bits) ( \
 551{ \
 552 int32x4_t __tvins = (__vins); \
 553 int32x4_t __tvsh = (__vsh); \
 554 uint8_t __tbits = (__bits); \
 555 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
 556 3,2,1,0); \
 557 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
 558 3,2,1,0); \
	 559	int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, __tbits, \
 560 34); \
 561 __builtin_shufflevector(__r, __r, 3,2,1,0); \
 562})
 563#endif
 564#endif
 565
532_INTRINSATTR 566_INTRINSATTR
533static __inline void 567static __inline void
534vst1q_u32(uint32_t *__p32, uint32x4_t __v) 568vst1q_u32(uint32_t *__p32, uint32x4_t __v)
535{ 569{
536#if defined(__GNUC__) && !defined(__clang__) 570#if defined(__GNUC__) && !defined(__clang__)
537#ifdef __aarch64__ 571#ifdef __aarch64__
538 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; 572 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
539 573
540 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); 574 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
541#else 575#else
542 __builtin_neon_si *__p = (__builtin_neon_si *)__p32; 576 __builtin_neon_si *__p = (__builtin_neon_si *)__p32;
543 577
544 __builtin_neon_vst1v4si(__p, (int32x4_t)__v); 578 __builtin_neon_vst1v4si(__p, (int32x4_t)__v);
545#endif 579#endif
546#elif defined(__clang__) 580#elif defined(__clang__)
547#ifndef __LITTLE_ENDIAN__ 581#ifndef __LITTLE_ENDIAN__
548 __v = __builtin_shufflevector(__v, __v, 3,2,1,0); 582 __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
549#endif 583#endif
550 __builtin_neon_vst1q_v(__p32, __v, 50); 584 __builtin_neon_vst1q_v(__p32, __v, 50);
551#endif 585#endif
552} 586}
553 587
554_INTRINSATTR 588_INTRINSATTR
555static __inline void 589static __inline void
556vst1q_u8(uint8_t *__p8, uint8x16_t __v) 590vst1q_u8(uint8_t *__p8, uint8x16_t __v)
557{ 591{
558#if defined(__GNUC__) && !defined(__clang__) 592#if defined(__GNUC__) && !defined(__clang__)
559#ifdef __aarch64__ 593#ifdef __aarch64__
560 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; 594 __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;
561 595
562 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); 596 __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
563#else 597#else
564 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; 598 __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;
565 599
566 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); 600 __builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
567#endif 601#endif
568#elif defined(__clang__) 602#elif defined(__clang__)
569#ifndef __LITTLE_ENDIAN__ 603#ifndef __LITTLE_ENDIAN__
570 __v = __builtin_shufflevector(__v, __v, 604 __v = __builtin_shufflevector(__v, __v,
571 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); 605 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
572#endif 606#endif
573 __builtin_neon_vst1q_v(__p8, __v, 48); 607 __builtin_neon_vst1q_v(__p8, __v, 48);
574#endif 608#endif
575} 609}
576 610
577#ifndef __aarch64__ /* XXX */ 611#ifndef __aarch64__ /* XXX */
578 612
579_INTRINSATTR 613_INTRINSATTR
580static __inline uint8x8_t 614static __inline uint8x8_t
581vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx) 615vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
582{ 616{
583#if defined(__GNUC__) && !defined(__clang__) 617#if defined(__GNUC__) && !defined(__clang__)
584 return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab, 618 return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
585 (int8x8_t)__idx); 619 (int8x8_t)__idx);
586#elif defined(__clang__) 620#elif defined(__clang__)
587 uint8x8_t __ret; 621 uint8x8_t __ret;
588#ifndef __LITTLE_ENDIAN__ 622#ifndef __LITTLE_ENDIAN__
589 __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0); 623 __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
590 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); 624 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
591#endif 625#endif
592 __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab, 626 __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
593 (int8x8_t)__idx, 16); 627 (int8x8_t)__idx, 16);
594#ifndef __LITTLE_ENDIAN__ 628#ifndef __LITTLE_ENDIAN__
595 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); 629 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
596#endif 630#endif
597 return __ret; 631 return __ret;
598#endif 632#endif
599} 633}
600 634
601_INTRINSATTR 635_INTRINSATTR
602static __inline uint8x8_t 636static __inline uint8x8_t
603vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx) 637vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
604{ 638{
605#if defined(__GNUC__) && !defined(__clang__) 639#if defined(__GNUC__) && !defined(__clang__)
606 union { 640 union {
607 uint8x8x2_t __u8x8x82; 641 uint8x8x2_t __u8x8x82;
608 __builtin_neon_ti __ti; 642 __builtin_neon_ti __ti;
609 } __u = { __tab }; 643 } __u = { __tab };
610 return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx); 644 return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
611#elif defined(__clang__) 645#elif defined(__clang__)
612 uint8x8_t __ret; 646 uint8x8_t __ret;
613#ifndef __LITTLE_ENDIAN__ 647#ifndef __LITTLE_ENDIAN__
614 __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0], 648 __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
615 7,6,5,4,3,2,1,0); 649 7,6,5,4,3,2,1,0);
616 __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1], 650 __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
617 7,6,5,4,3,2,1,0); 651 7,6,5,4,3,2,1,0);
618 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0); 652 __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
619#endif 653#endif
620 __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0], 654 __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
621 (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16); 655 (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
622#ifndef __LITTLE_ENDIAN__ 656#ifndef __LITTLE_ENDIAN__
623 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0); 657 __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
624#endif 658#endif
625 return __ret; 659 return __ret;
626#endif 660#endif
627} 661}
628 662
629#endif /* !defined(__aarch64__) */ 663#endif /* !defined(__aarch64__) */
630 664
631#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */ 665#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */
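
The vsriq_n_u32 intrinsic added above maps to VSRI (shift right and insert):
each 32-bit lane of __vsh is shifted right by __bits and inserted into the
corresponding lane of __vins, whose top __bits bits are left untouched.  A
worked lane, with illustrative values and __bits = 24:

	uint32_t vins = 0xaabbccdd, vsh = 0x11223344;
	/* vsh >> 24 == 0x00000011; top 24 bits of vins are preserved */
	uint32_t r = (vins & 0xffffff00) | (vsh >> 24);	/* == 0xaabbcc11 */

This is what makes vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n) a two-instruction
left rotate by n: the left shift provides the high bits and VSRI fills the
low n bits with x >> (32 - n), as in the (disabled) #else branch of
vrolq_n_u32 in the chacha_neon.c diff below.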

cvs diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c

--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:06 1.4
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:56 1.5
@@ -1,369 +1,377 @@ @@ -1,369 +1,377 @@
1/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
15 * 15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/types.h> 29#include <sys/types.h>
30#include <sys/endian.h> 30#include <sys/endian.h>
31 31
32#include "arm_neon.h" 32#include "arm_neon.h"
33#include "chacha_neon.h" 33#include "chacha_neon.h"
34 34
35static inline uint32x4_t 35static inline uint32x4_t
36vrolq_n_u32(uint32x4_t x, uint8_t n) 36vrolq_n_u32(uint32x4_t x, uint8_t n)
37{ 37{
38 38
 39 /*
 40 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
 41 * practice it hurts performance at least on Cortex-A8.
 42 */
 43#if 1
39 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); 44 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
 45#else
 46 return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
 47#endif
40} 48}
41 49
42static inline uint32x4_t 50static inline uint32x4_t
43vhtole_u32(uint32x4_t x) 51vhtole_u32(uint32x4_t x)
44{ 52{
45#if _BYTE_ORDER == _LITTLE_ENDIAN 53#if _BYTE_ORDER == _LITTLE_ENDIAN
46 return x; 54 return x;
47#elif _BYTE_ORDER == _BIG_ENDIAN 55#elif _BYTE_ORDER == _BIG_ENDIAN
48 return vrev32q_u8(x); 56 return vrev32q_u8(x);
49#endif 57#endif
50} 58}
51 59
52static inline uint32x4_t 60static inline uint32x4_t
53vletoh_u32(uint32x4_t x) 61vletoh_u32(uint32x4_t x)
54{ 62{
55#if _BYTE_ORDER == _LITTLE_ENDIAN 63#if _BYTE_ORDER == _LITTLE_ENDIAN
56 return x; 64 return x;
57#elif _BYTE_ORDER == _BIG_ENDIAN 65#elif _BYTE_ORDER == _BIG_ENDIAN
58 return vrev32q_u8(x); 66 return vrev32q_u8(x);
59#endif 67#endif
60} 68}
61  69
62static inline uint32x4_t 70static inline uint32x4_t
63rol16(uint32x4_t x) 71rol16(uint32x4_t x)
64{ 72{
65 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x); 73 uint16x8_t y16, x16 = vreinterpretq_u16_u32(x);
66 74
67 y16 = vrev32q_u16(x16); 75 y16 = vrev32q_u16(x16);
68 76
69 return vreinterpretq_u32_u16(y16); 77 return vreinterpretq_u32_u16(y16);
70} 78}
71 79
72static inline uint32x4_t 80static inline uint32x4_t
73rol12(uint32x4_t x) 81rol12(uint32x4_t x)
74{ 82{
75 83
76 return vrolq_n_u32(x, 12); 84 return vrolq_n_u32(x, 12);
77} 85}
78 86
79static inline uint32x4_t 87static inline uint32x4_t
80rol8(uint32x4_t x) 88rol8(uint32x4_t x)
81{ 89{
82#if defined(__aarch64__) 90#if defined(__aarch64__)
83 static const uint8x16_t rol8_tab = { 91 static const uint8x16_t rol8_tab = {
84 3, 0, 1, 2, 7, 4, 5, 6, 92 3, 0, 1, 2, 7, 4, 5, 6,
85 11, 8, 9,10, 15,12,13,14, 93 11, 8, 9,10, 15,12,13,14,
86 }; 94 };
87 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x); 95 uint8x16_t y8, x8 = vreinterpretq_u8_u32(x);
88 96
89 y8 = vqtbl1q_u8(x8, rol8_tab); 97 y8 = vqtbl1q_u8(x8, rol8_tab);
90 98
91 return vreinterpretq_u32_u8(y8); 99 return vreinterpretq_u32_u8(y8);
92#elif 0 100#elif 0
93 /* 101 /*
94 * GCC does a lousy job with this, spilling two 64-bit vector 102 * GCC does a lousy job with this, spilling two 64-bit vector
95 * registers to the stack every time. There should be plenty 103 * registers to the stack every time. There should be plenty
96 * of vector registers free, requiring no spills at all, and 104 * of vector registers free, requiring no spills at all, and
97 * GCC should be able to hoist the load of rol8_tab out of any 105 * GCC should be able to hoist the load of rol8_tab out of any
98 * loops, but it doesn't and so attempting to use VTBL hurts 106 * loops, but it doesn't and so attempting to use VTBL hurts
99 * more than it helps. 107 * more than it helps.
100 */ 108 */
101 static const uint8x8_t rol8_tab = { 109 static const uint8x8_t rol8_tab = {
102 3, 0, 1, 2, 7, 4, 5, 6, 110 3, 0, 1, 2, 7, 4, 5, 6,
103 }; 111 };
104 112
105 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x); 113 uint64x2_t y64, x64 = vreinterpretq_u64_u32(x);
106 114
107 y64 = (uint64x2_t) { 115 y64 = (uint64x2_t) {
108 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab), 116 (uint64_t)vtbl1_u8((uint8x8_t)x64[0], rol8_tab),
109 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab), 117 (uint64_t)vtbl1_u8((uint8x8_t)x64[1], rol8_tab),
110 }; 118 };
111 119
112 return vreinterpretq_u32_u64(y64); 120 return vreinterpretq_u32_u64(y64);
113#else 121#else
114 return vrolq_n_u32(x, 8); 122 return vrolq_n_u32(x, 8);
115#endif 123#endif
116} 124}
117 125
118static inline uint32x4_t 126static inline uint32x4_t
119rol7(uint32x4_t x) 127rol7(uint32x4_t x)
120{ 128{
121 129
122 return vrolq_n_u32(x, 7); 130 return vrolq_n_u32(x, 7);
123} 131}
124  132
125static inline void 133static inline void
126chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3, 134chacha_permute(uint32x4_t *p0, uint32x4_t *p1, uint32x4_t *p2, uint32x4_t *p3,
127 unsigned nr) 135 unsigned nr)
128{ 136{
129 uint32x4_t r0, r1, r2, r3; 137 uint32x4_t r0, r1, r2, r3;
130 uint32x4_t c0, c1, c2, c3; 138 uint32x4_t c0, c1, c2, c3;
131 139
132 r0 = *p0; 140 r0 = *p0;
133 r1 = *p1; 141 r1 = *p1;
134 r2 = *p2; 142 r2 = *p2;
135 r3 = *p3; 143 r3 = *p3;
136 144
137 for (; nr > 0; nr -= 2) { 145 for (; nr > 0; nr -= 2) {
138 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3); 146 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol16(r3);
139 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1); 147 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol12(r1);
140 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3); 148 r0 = vaddq_u32(r0, r1); r3 ^= r0; r3 = rol8(r3);
141 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1); 149 r2 = vaddq_u32(r2, r3); r1 ^= r2; r1 = rol7(r1);
142 150
143 c0 = r0; 151 c0 = r0;
144 c1 = vextq_u32(r1, r1, 1); 152 c1 = vextq_u32(r1, r1, 1);
145 c2 = vextq_u32(r2, r2, 2); 153 c2 = vextq_u32(r2, r2, 2);
146 c3 = vextq_u32(r3, r3, 3); 154 c3 = vextq_u32(r3, r3, 3);
147 155
148 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3); 156 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol16(c3);
149 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1); 157 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol12(c1);
150 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3); 158 c0 = vaddq_u32(c0, c1); c3 ^= c0; c3 = rol8(c3);
151 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1); 159 c2 = vaddq_u32(c2, c3); c1 ^= c2; c1 = rol7(c1);
152 160
153 r0 = c0; 161 r0 = c0;
154 r1 = vextq_u32(c1, c1, 3); 162 r1 = vextq_u32(c1, c1, 3);
155 r2 = vextq_u32(c2, c2, 2); 163 r2 = vextq_u32(c2, c2, 2);
156 r3 = vextq_u32(c3, c3, 1); 164 r3 = vextq_u32(c3, c3, 1);
157 } 165 }
158 166
159 *p0 = r0; 167 *p0 = r0;
160 *p1 = r1; 168 *p1 = r1;
161 *p2 = r2; 169 *p2 = r2;
162 *p3 = r3; 170 *p3 = r3;
163} 171}
164  172
165void 173void
166chacha_core_neon(uint8_t out[restrict static 64], 174chacha_core_neon(uint8_t out[restrict static 64],
167 const uint8_t in[static 16], 175 const uint8_t in[static 16],
168 const uint8_t k[static 32], 176 const uint8_t k[static 32],
169 const uint8_t c[static 16], 177 const uint8_t c[static 16],
170 unsigned nr) 178 unsigned nr)
171{ 179{
172 uint32x4_t in0, in1, in2, in3; 180 uint32x4_t in0, in1, in2, in3;
173 uint32x4_t r0, r1, r2, r3; 181 uint32x4_t r0, r1, r2, r3;
174 182
175 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 183 r0 = in0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
176 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 184 r1 = in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
177 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 185 r2 = in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
178 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 186 r3 = in3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
179 187
180 chacha_permute(&r0, &r1, &r2, &r3, nr); 188 chacha_permute(&r0, &r1, &r2, &r3, nr);
181 189
182 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0))); 190 vst1q_u32((uint32_t *)out + 0, vhtole_u32(vaddq_u32(r0, in0)));
183 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1))); 191 vst1q_u32((uint32_t *)out + 4, vhtole_u32(vaddq_u32(r1, in1)));
184 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2))); 192 vst1q_u32((uint32_t *)out + 8, vhtole_u32(vaddq_u32(r2, in2)));
185 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3))); 193 vst1q_u32((uint32_t *)out + 12, vhtole_u32(vaddq_u32(r3, in3)));
186} 194}
187 195
188void 196void
189hchacha_neon(uint8_t out[restrict static 32], 197hchacha_neon(uint8_t out[restrict static 32],
190 const uint8_t in[static 16], 198 const uint8_t in[static 16],
191 const uint8_t k[static 32], 199 const uint8_t k[static 32],
192 const uint8_t c[static 16], 200 const uint8_t c[static 16],
193 unsigned nr) 201 unsigned nr)
194{ 202{
195 uint32x4_t r0, r1, r2, r3; 203 uint32x4_t r0, r1, r2, r3;
196 204
197 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c)); 205 r0 = vletoh_u32(vld1q_u32((const uint32_t *)c));
198 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 206 r1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
199 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 207 r2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
200 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in)); 208 r3 = vletoh_u32(vld1q_u32((const uint32_t *)in));
201 209
202 chacha_permute(&r0, &r1, &r2, &r3, nr); 210 chacha_permute(&r0, &r1, &r2, &r3, nr);
203 211
204 vst1q_u32((uint32_t *)out + 0, r0); 212 vst1q_u32((uint32_t *)out + 0, r0);
205 vst1q_u32((uint32_t *)out + 4, r3); 213 vst1q_u32((uint32_t *)out + 4, r3);
206} 214}
207  215
208void 216void
209chacha_stream_neon(uint8_t *restrict s, size_t n, 217chacha_stream_neon(uint8_t *restrict s, size_t n,
210 uint32_t blkno, 218 uint32_t blkno,
211 const uint8_t nonce[static 12], 219 const uint8_t nonce[static 12],
212 const uint8_t k[static 32], 220 const uint8_t k[static 32],
213 unsigned nr) 221 unsigned nr)
214{ 222{
215 223
216#ifdef __aarch64__ 224#ifdef __aarch64__
217 for (; n >= 256; s += 256, n -= 256, blkno += 4) 225 for (; n >= 256; s += 256, n -= 256, blkno += 4)
218 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); 226 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
219#endif 227#endif
220 228
221 if (n) { 229 if (n) {
222 const uint32x4_t blkno_inc = {1,0,0,0}; 230 const uint32x4_t blkno_inc = {1,0,0,0};
223 uint32x4_t in0, in1, in2, in3; 231 uint32x4_t in0, in1, in2, in3;
224 uint32x4_t r0, r1, r2, r3; 232 uint32x4_t r0, r1, r2, r3;
225 233
226 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 234 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
227 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 235 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
228 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 236 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
229 in3 = (uint32x4_t) { 237 in3 = (uint32x4_t) {
230 blkno, 238 blkno,
231 le32dec(nonce), 239 le32dec(nonce),
232 le32dec(nonce + 4), 240 le32dec(nonce + 4),
233 le32dec(nonce + 8) 241 le32dec(nonce + 8)
234 }; 242 };
235 243
236 for (; n; s += 64, n -= 64) { 244 for (; n; s += 64, n -= 64) {
237 r0 = in0; 245 r0 = in0;
238 r1 = in1; 246 r1 = in1;
239 r2 = in2; 247 r2 = in2;
240 r3 = in3; 248 r3 = in3;
241 chacha_permute(&r0, &r1, &r2, &r3, nr); 249 chacha_permute(&r0, &r1, &r2, &r3, nr);
242 r0 = vhtole_u32(vaddq_u32(r0, in0)); 250 r0 = vhtole_u32(vaddq_u32(r0, in0));
243 r1 = vhtole_u32(vaddq_u32(r1, in1)); 251 r1 = vhtole_u32(vaddq_u32(r1, in1));
244 r2 = vhtole_u32(vaddq_u32(r2, in2)); 252 r2 = vhtole_u32(vaddq_u32(r2, in2));
245 r3 = vhtole_u32(vaddq_u32(r3, in3)); 253 r3 = vhtole_u32(vaddq_u32(r3, in3));
246 254
247 if (n < 64) { 255 if (n < 64) {
248 uint8_t buf[64] __aligned(16); 256 uint8_t buf[64] __aligned(16);
249 257
250 vst1q_u32((uint32_t *)buf + 4*0, r0); 258 vst1q_u32((uint32_t *)buf + 4*0, r0);
251 vst1q_u32((uint32_t *)buf + 4*1, r1); 259 vst1q_u32((uint32_t *)buf + 4*1, r1);
252 vst1q_u32((uint32_t *)buf + 4*2, r2); 260 vst1q_u32((uint32_t *)buf + 4*2, r2);
253 vst1q_u32((uint32_t *)buf + 4*3, r3); 261 vst1q_u32((uint32_t *)buf + 4*3, r3);
254 memcpy(s, buf, n); 262 memcpy(s, buf, n);
255 263
256 break; 264 break;
257 } 265 }
258 266
259 vst1q_u32((uint32_t *)s + 4*0, r0); 267 vst1q_u32((uint32_t *)s + 4*0, r0);
260 vst1q_u32((uint32_t *)s + 4*1, r1); 268 vst1q_u32((uint32_t *)s + 4*1, r1);
261 vst1q_u32((uint32_t *)s + 4*2, r2); 269 vst1q_u32((uint32_t *)s + 4*2, r2);
262 vst1q_u32((uint32_t *)s + 4*3, r3); 270 vst1q_u32((uint32_t *)s + 4*3, r3);
263 in3 = vaddq_u32(in3, blkno_inc); 271 in3 = vaddq_u32(in3, blkno_inc);
264 } 272 }
265 } 273 }
266} 274}
267  275
268void 276void
269chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, 277chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
270 uint32_t blkno, 278 uint32_t blkno,
271 const uint8_t nonce[static 12], 279 const uint8_t nonce[static 12],
272 const uint8_t k[static 32], 280 const uint8_t k[static 32],
273 unsigned nr) 281 unsigned nr)
274{ 282{
275 283
276#ifdef __aarch64__ 284#ifdef __aarch64__
277 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) 285 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
278 chacha_stream_xor256_neon(s, p, blkno, nonce, k, 286 chacha_stream_xor256_neon(s, p, blkno, nonce, k,
279 chacha_const32, nr); 287 chacha_const32, nr);
280#endif 288#endif
281 289
282 if (n) { 290 if (n) {
283 const uint32x4_t blkno_inc = {1,0,0,0}; 291 const uint32x4_t blkno_inc = {1,0,0,0};
284 uint32x4_t in0, in1, in2, in3; 292 uint32x4_t in0, in1, in2, in3;
285 uint32x4_t r0, r1, r2, r3; 293 uint32x4_t r0, r1, r2, r3;
286 294
287 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 295 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
288 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 296 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
289 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 297 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
290 in3 = (uint32x4_t) { 298 in3 = (uint32x4_t) {
291 blkno, 299 blkno,
292 le32dec(nonce), 300 le32dec(nonce),
293 le32dec(nonce + 4), 301 le32dec(nonce + 4),
294 le32dec(nonce + 8) 302 le32dec(nonce + 8)
295 }; 303 };
296 304
297 for (; n; s += 64, p += 64, n -= 64) { 305 for (; n; s += 64, p += 64, n -= 64) {
298 r0 = in0; 306 r0 = in0;
299 r1 = in1; 307 r1 = in1;
300 r2 = in2; 308 r2 = in2;
301 r3 = in3; 309 r3 = in3;
302 chacha_permute(&r0, &r1, &r2, &r3, nr); 310 chacha_permute(&r0, &r1, &r2, &r3, nr);
303 r0 = vhtole_u32(vaddq_u32(r0, in0)); 311 r0 = vhtole_u32(vaddq_u32(r0, in0));
304 r1 = vhtole_u32(vaddq_u32(r1, in1)); 312 r1 = vhtole_u32(vaddq_u32(r1, in1));
305 r2 = vhtole_u32(vaddq_u32(r2, in2)); 313 r2 = vhtole_u32(vaddq_u32(r2, in2));
306 r3 = vhtole_u32(vaddq_u32(r3, in3)); 314 r3 = vhtole_u32(vaddq_u32(r3, in3));
307 315
308 if (n < 64) { 316 if (n < 64) {
309 uint8_t buf[64] __aligned(16); 317 uint8_t buf[64] __aligned(16);
310 unsigned i; 318 unsigned i;
311 319
312 vst1q_u32((uint32_t *)buf + 4*0, r0); 320 vst1q_u32((uint32_t *)buf + 4*0, r0);
313 vst1q_u32((uint32_t *)buf + 4*1, r1); 321 vst1q_u32((uint32_t *)buf + 4*1, r1);
314 vst1q_u32((uint32_t *)buf + 4*2, r2); 322 vst1q_u32((uint32_t *)buf + 4*2, r2);
315 vst1q_u32((uint32_t *)buf + 4*3, r3); 323 vst1q_u32((uint32_t *)buf + 4*3, r3);
316 324
317 for (i = 0; i < n - n%4; i += 4) 325 for (i = 0; i < n - n%4; i += 4)
318 le32enc(s + i, 326 le32enc(s + i,
319 le32dec(p + i) ^ le32dec(buf + i)); 327 le32dec(p + i) ^ le32dec(buf + i));
320 for (; i < n; i++) 328 for (; i < n; i++)
321 s[i] = p[i] ^ buf[i]; 329 s[i] = p[i] ^ buf[i];
322 330
323 break; 331 break;
324 } 332 }
325 333
326 r0 ^= vld1q_u32((const uint32_t *)p + 4*0); 334 r0 ^= vld1q_u32((const uint32_t *)p + 4*0);
327 r1 ^= vld1q_u32((const uint32_t *)p + 4*1); 335 r1 ^= vld1q_u32((const uint32_t *)p + 4*1);
328 r2 ^= vld1q_u32((const uint32_t *)p + 4*2); 336 r2 ^= vld1q_u32((const uint32_t *)p + 4*2);
329 r3 ^= vld1q_u32((const uint32_t *)p + 4*3); 337 r3 ^= vld1q_u32((const uint32_t *)p + 4*3);
330 vst1q_u32((uint32_t *)s + 4*0, r0); 338 vst1q_u32((uint32_t *)s + 4*0, r0);
331 vst1q_u32((uint32_t *)s + 4*1, r1); 339 vst1q_u32((uint32_t *)s + 4*1, r1);
332 vst1q_u32((uint32_t *)s + 4*2, r2); 340 vst1q_u32((uint32_t *)s + 4*2, r2);
333 vst1q_u32((uint32_t *)s + 4*3, r3); 341 vst1q_u32((uint32_t *)s + 4*3, r3);
334 in3 = vaddq_u32(in3, blkno_inc); 342 in3 = vaddq_u32(in3, blkno_inc);
335 } 343 }
336 } 344 }
337} 345}
338  346
339void 347void
340xchacha_stream_neon(uint8_t *restrict s, size_t nbytes, 348xchacha_stream_neon(uint8_t *restrict s, size_t nbytes,
341 uint32_t blkno, 349 uint32_t blkno,
342 const uint8_t nonce[static 24], 350 const uint8_t nonce[static 24],
343 const uint8_t k[static 32], 351 const uint8_t k[static 32],
344 unsigned nr) 352 unsigned nr)
345{ 353{
346 uint8_t subkey[32]; 354 uint8_t subkey[32];
347 uint8_t subnonce[12]; 355 uint8_t subnonce[12];
348 356
349 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 357 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
350 memset(subnonce, 0, 4); 358 memset(subnonce, 0, 4);
351 memcpy(subnonce + 4, nonce + 16, 8); 359 memcpy(subnonce + 4, nonce + 16, 8);
352 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr); 360 chacha_stream_neon(s, nbytes, blkno, subnonce, subkey, nr);
353} 361}
354 362
355void 363void
356xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes, 364xchacha_stream_xor_neon(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
357 uint32_t blkno, 365 uint32_t blkno,
358 const uint8_t nonce[static 24], 366 const uint8_t nonce[static 24],
359 const uint8_t k[static 32], 367 const uint8_t k[static 32],
360 unsigned nr) 368 unsigned nr)
361{ 369{
362 uint8_t subkey[32]; 370 uint8_t subkey[32];
363 uint8_t subnonce[12]; 371 uint8_t subnonce[12];
364 372
365 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr); 373 hchacha_neon(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
366 memset(subnonce, 0, 4); 374 memset(subnonce, 0, 4);
367 memcpy(subnonce + 4, nonce + 16, 8); 375 memcpy(subnonce + 4, nonce + 16, 8);
368 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr); 376 chacha_stream_xor_neon(c, p, nbytes, blkno, subnonce, subkey, nr);
369} 377}