@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */
+/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -527,6 +527,40 @@
__builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif /* __LITTLE_ENDIAN__ */
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits) /* VSRI.32: per-lane, shift __vsh right __bits, insert into __vins */
+{
+#ifdef __aarch64__
+	return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits); /* aarch64 GCC has an unsigned USRI builtin */
+#else
+	return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins, /* arm32 builtin is signed-typed; casts are value-preserving */
+	    (int32x4_t)__vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define	vsriq_n_u32(__vins, __vsh, __bits)				      \
+	(uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),	      \
+	    (int32x4_t)(__vsh), (__bits), 34)
+#else
+#define	vsriq_n_u32(__vins, __vsh, __bits) (				      \
+{									      \
+	int32x4_t __tvins = (int32x4_t)(__vins);			      \
+	int32x4_t __tvsh = (int32x4_t)(__vsh);				      \
+	uint8_t __tbits = (__bits);					      \
+	int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,	      \
+	    3,2,1,0);							      \
+	int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,	      \
+	    3,2,1,0);							      \
+	int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, __tbits,  \
+	    34);							      \
+	(uint32x4_t)__builtin_shufflevector(__r, __r, 3,2,1,0);		      \
+})
+#endif
#endif
_INTRINSATTR
@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */
+/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -36,7 +36,15 @@
vrolq_n_u32(uint32x4_t x, uint8_t n)
{
+	/*
+	 * Rotate each 32-bit lane of x left by n.  VSHL/VSRI would be
+	 * one instruction shorter than VSHL/VSHR/VORR, but in practice
+	 * it hurts performance at least on Cortex-A8, so keep it off.
+	 */
+#if 1
	return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+#else
+	return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
+#endif
}
static inline uint32x4_t