Mon Jul 27 20:58:56 2020 UTC ()
Note that VSRI seems to hurt here.


(riastradh)
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h
diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/arm_neon.h (expand / switch to unified diff)

--- src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:06 1.2
+++ src/sys/crypto/chacha/arch/arm/arm_neon.h 2020/07/27 20:58:56 1.3
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -519,26 +519,60 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t  @@ -519,26 +519,60 @@ vsliq_n_s32(int32x4_t __vins, int32x4_t
519 int32x4_t __tvsh = (__vsh); \ 519 int32x4_t __tvsh = (__vsh); \
520 uint8_t __tbits = (__bits); \ 520 uint8_t __tbits = (__bits); \
521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \ 521 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
522 3,2,1,0); \ 522 3,2,1,0); \
523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \ 523 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
524 3,2,1,0); \ 524 3,2,1,0); \
525 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \ 525 int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \
526 34); \ 526 34); \
527 __builtin_shufflevector(__r, __r, 3,2,1,0); \ 527 __builtin_shufflevector(__r, __r, 3,2,1,0); \
528}) 528})
529#endif /* __LITTLE_ENDIAN__ */ 529#endif /* __LITTLE_ENDIAN__ */
530#endif 530#endif
531 531
 532#if defined(__GNUC__) && !defined(__clang__)
 533_INTRINSATTR
 534static __inline uint32x4_t
 535vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
 536{
 537#ifdef __aarch64__
 538 return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
 539#else
 540 return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
 541 (int32x4_t)__vsh, __bits);
 542#endif
 543}
 544#elif defined(__clang__)
 545#ifdef __LITTLE_ENDIAN__
 546#define vsriq_n_u32(__vins, __vsh, __bits) \
 547 (uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
 548 (int32x4_t)(__vsh), (__bits), 34)
 549#else
 550#define vsriq_n_u32(__vins, __vsh, __bits) ( \
 551{ \
 552 int32x4_t __tvins = (int32x4_t)(__vins); \
 553 int32x4_t __tvsh = (int32x4_t)(__vsh); \
 554 uint8_t __tbits = (__bits); \
 555 int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
 556 3,2,1,0); \
 557 int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
 558 3,2,1,0); \
 559 int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r, __tbits, \
 560 34); \
 561 (uint32x4_t)__builtin_shufflevector(__r, __r, 3,2,1,0); \
 562})
 563#endif
 564#endif
 565
532_INTRINSATTR 566_INTRINSATTR
533static __inline void 567static __inline void
534vst1q_u32(uint32_t *__p32, uint32x4_t __v) 568vst1q_u32(uint32_t *__p32, uint32x4_t __v)
535{ 569{
536#if defined(__GNUC__) && !defined(__clang__) 570#if defined(__GNUC__) && !defined(__clang__)
537#ifdef __aarch64__ 571#ifdef __aarch64__
538 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32; 572 __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;
539 573
540 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v); 574 __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
541#else 575#else
542 __builtin_neon_si *__p = (__builtin_neon_si *)__p32; 576 __builtin_neon_si *__p = (__builtin_neon_si *)__p32;
543 577
544 __builtin_neon_vst1v4si(__p, (int32x4_t)__v); 578 __builtin_neon_vst1v4si(__p, (int32x4_t)__v);

cvs diff -r1.4 -r1.5 src/sys/crypto/chacha/arch/arm/chacha_neon.c (expand / switch to unified diff)

--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:06 1.4
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/27 20:58:56 1.5
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -26,27 +26,35 @@ @@ -26,27 +26,35 @@
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/types.h> 29#include <sys/types.h>
30#include <sys/endian.h> 30#include <sys/endian.h>
31 31
32#include "arm_neon.h" 32#include "arm_neon.h"
33#include "chacha_neon.h" 33#include "chacha_neon.h"
34 34
35static inline uint32x4_t 35static inline uint32x4_t
36vrolq_n_u32(uint32x4_t x, uint8_t n) 36vrolq_n_u32(uint32x4_t x, uint8_t n)
37{ 37{
38 38
 39 /*
 40 * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
 41 * practice it hurts performance at least on Cortex-A8.
 42 */
 43#if 1
39 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n); 44 return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
 45#else
 46 return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
 47#endif
40} 48}
41 49
42static inline uint32x4_t 50static inline uint32x4_t
43vhtole_u32(uint32x4_t x) 51vhtole_u32(uint32x4_t x)
44{ 52{
45#if _BYTE_ORDER == _LITTLE_ENDIAN 53#if _BYTE_ORDER == _LITTLE_ENDIAN
46 return x; 54 return x;
47#elif _BYTE_ORDER == _BIG_ENDIAN 55#elif _BYTE_ORDER == _BIG_ENDIAN
48 return vrev32q_u8(x); 56 return vrev32q_u8(x);
49#endif 57#endif
50} 58}
51 59
52static inline uint32x4_t 60static inline uint32x4_t