Tue Jul 28 20:08:48 2020 UTC ()
Implement 4-way vectorization of ChaCha for armv7 NEON.

cgd performance is not as good as I was hoping (~4% improvement over
chacha_ref.c) but it should improve substantially more if we let the
cgd worker thread keep fpu state so we don't have to pay the cost of
isb and zero-the-fpu on every 512-byte cgd block.


(riastradh)
diff -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h
diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm
diff -r0 -r1.1 src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
diff -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile

cvs diff -r1.6 -r1.7 src/sys/crypto/chacha/arch/arm/chacha_neon.c (expand / switch to unified diff)

--- src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:05:33 1.6
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.c 2020/07/28 20:08:48 1.7
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: chacha_neon.c,v 1.6 2020/07/28 20:05:33 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.c,v 1.7 2020/07/28 20:08:48 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -211,30 +211,28 @@ hchacha_neon(uint8_t out[restrict static @@ -211,30 +211,28 @@ hchacha_neon(uint8_t out[restrict static
211 211
212 vst1q_u32((uint32_t *)out + 0, r0); 212 vst1q_u32((uint32_t *)out + 0, r0);
213 vst1q_u32((uint32_t *)out + 4, r3); 213 vst1q_u32((uint32_t *)out + 4, r3);
214} 214}
215  215
216void 216void
217chacha_stream_neon(uint8_t *restrict s, size_t n, 217chacha_stream_neon(uint8_t *restrict s, size_t n,
218 uint32_t blkno, 218 uint32_t blkno,
219 const uint8_t nonce[static 12], 219 const uint8_t nonce[static 12],
220 const uint8_t k[static 32], 220 const uint8_t k[static 32],
221 unsigned nr) 221 unsigned nr)
222{ 222{
223 223
224#ifdef __aarch64__ 
225 for (; n >= 256; s += 256, n -= 256, blkno += 4) 224 for (; n >= 256; s += 256, n -= 256, blkno += 4)
226 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr); 225 chacha_stream256_neon(s, blkno, nonce, k, chacha_const32, nr);
227#endif 
228 226
229 if (n) { 227 if (n) {
230 const uint32x4_t blkno_inc = {1,0,0,0}; 228 const uint32x4_t blkno_inc = {1,0,0,0};
231 uint32x4_t in0, in1, in2, in3; 229 uint32x4_t in0, in1, in2, in3;
232 uint32x4_t r0, r1, r2, r3; 230 uint32x4_t r0, r1, r2, r3;
233 231
234 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 232 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
235 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 233 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
236 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 234 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
237 in3 = (uint32x4_t) { 235 in3 = (uint32x4_t) {
238 blkno, 236 blkno,
239 le32dec(nonce), 237 le32dec(nonce),
240 le32dec(nonce + 4), 238 le32dec(nonce + 4),
@@ -271,31 +269,29 @@ chacha_stream_neon(uint8_t *restrict s,  @@ -271,31 +269,29 @@ chacha_stream_neon(uint8_t *restrict s,
271 in3 = vaddq_u32(in3, blkno_inc); 269 in3 = vaddq_u32(in3, blkno_inc);
272 } 270 }
273 } 271 }
274} 272}
275  273
276void 274void
277chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n, 275chacha_stream_xor_neon(uint8_t *s, const uint8_t *p, size_t n,
278 uint32_t blkno, 276 uint32_t blkno,
279 const uint8_t nonce[static 12], 277 const uint8_t nonce[static 12],
280 const uint8_t k[static 32], 278 const uint8_t k[static 32],
281 unsigned nr) 279 unsigned nr)
282{ 280{
283 281
284#ifdef __aarch64__ 
285 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) 282 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4)
286 chacha_stream_xor256_neon(s, p, blkno, nonce, k, 283 chacha_stream_xor256_neon(s, p, blkno, nonce, k,
287 chacha_const32, nr); 284 chacha_const32, nr);
288#endif 
289 285
290 if (n) { 286 if (n) {
291 const uint32x4_t blkno_inc = {1,0,0,0}; 287 const uint32x4_t blkno_inc = {1,0,0,0};
292 uint32x4_t in0, in1, in2, in3; 288 uint32x4_t in0, in1, in2, in3;
293 uint32x4_t r0, r1, r2, r3; 289 uint32x4_t r0, r1, r2, r3;
294 290
295 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32)); 291 in0 = vletoh_u32(vld1q_u32((const uint32_t *)chacha_const32));
296 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k)); 292 in1 = vletoh_u32(vld1q_u32((const uint32_t *)k));
297 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4)); 293 in2 = vletoh_u32(vld1q_u32((const uint32_t *)k + 4));
298 in3 = (uint32x4_t) { 294 in3 = (uint32x4_t) {
299 blkno, 295 blkno,
300 le32dec(nonce), 296 le32dec(nonce),
301 le32dec(nonce + 4), 297 le32dec(nonce + 4),

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/chacha_neon.h (expand / switch to unified diff)

--- src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/chacha_neon.h 2020/07/28 20:08:48 1.3
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: chacha_neon.h,v 1.2 2020/07/27 20:51:29 riastradh Exp $ */ 1/* $NetBSD: chacha_neon.h,v 1.3 2020/07/28 20:08:48 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -54,32 +54,30 @@ void chacha_stream_xor_neon(uint8_t *, c @@ -54,32 +54,30 @@ void chacha_stream_xor_neon(uint8_t *, c
54 const uint8_t[static 32], 54 const uint8_t[static 32],
55 unsigned); 55 unsigned);
56void xchacha_stream_neon(uint8_t *restrict, size_t, 56void xchacha_stream_neon(uint8_t *restrict, size_t,
57 uint32_t, 57 uint32_t,
58 const uint8_t[static 24], 58 const uint8_t[static 24],
59 const uint8_t[static 32], 59 const uint8_t[static 32],
60 unsigned); 60 unsigned);
61void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t, 61void xchacha_stream_xor_neon(uint8_t *, const uint8_t *, size_t,
62 uint32_t, 62 uint32_t,
63 const uint8_t[static 24], 63 const uint8_t[static 24],
64 const uint8_t[static 32], 64 const uint8_t[static 32],
65 unsigned); 65 unsigned);
66 66
67#ifdef __aarch64__ 67/* Assembly helpers */
68/* Assembly helpers -- aarch64 only for now */ 
69void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t, 68void chacha_stream256_neon(uint8_t[restrict static 256], uint32_t,
70 const uint8_t[static 12], 69 const uint8_t[static 12],
71 const uint8_t[static 32], 70 const uint8_t[static 32],
72 const uint8_t[static 16], 71 const uint8_t[static 16],
73 unsigned); 72 unsigned);
74void chacha_stream_xor256_neon(uint8_t[restrict static 256], 73void chacha_stream_xor256_neon(uint8_t[restrict static 256],
75 const uint8_t[static 256], 74 const uint8_t[static 256],
76 uint32_t, 75 uint32_t,
77 const uint8_t[static 12], 76 const uint8_t[static 12],
78 const uint8_t[static 32], 77 const uint8_t[static 32],
79 const uint8_t[static 16], 78 const uint8_t[static 16],
80 unsigned); 79 unsigned);
81#endif /* __aarch64__ */ 
82 80
83extern const struct chacha_impl chacha_neon_impl; 81extern const struct chacha_impl chacha_neon_impl;
84 82
85#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */ 83#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_CHACHA_NEON_H */

cvs diff -r1.2 -r1.3 src/sys/crypto/chacha/arch/arm/files.chacha_arm (expand / switch to unified diff)

--- src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/27 20:51:29 1.2
+++ src/sys/crypto/chacha/arch/arm/files.chacha_arm 2020/07/28 20:08:48 1.3
@@ -1,11 +1,12 @@ @@ -1,11 +1,12 @@
1# $NetBSD: files.chacha_arm,v 1.2 2020/07/27 20:51:29 riastradh Exp $ 1# $NetBSD: files.chacha_arm,v 1.3 2020/07/28 20:08:48 riastradh Exp $
2 2
3ifdef aarch64 3ifdef aarch64
4makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a" 4makeoptions chacha "COPTS.chacha_neon.c"+="-march=armv8-a"
5else 5else
6makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon" 6makeoptions aes "COPTS.chacha_neon.c"+="-mfloat-abi=softfp -mfpu=neon"
7endif 7endif
8 8
9file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64) 9file crypto/chacha/arch/arm/chacha_neon.c chacha & (cpu_cortex | aarch64)
 10file crypto/chacha/arch/arm/chacha_neon_32.S chacha & cpu_cortex & !aarch64
10file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64 11file crypto/chacha/arch/arm/chacha_neon_64.S chacha & aarch64
11file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64) 12file crypto/chacha/arch/arm/chacha_neon_impl.c chacha & (cpu_cortex | aarch64)

File Added: src/sys/crypto/chacha/arch/arm/chacha_neon_32.S
/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")

	.fpu	neon

/*
 * ChaCha round, split up so we can interleave the quarterrounds on
 * independent rows/diagonals to maximize pipeline efficiency, with
 * spills to deal with the scarcity of registers.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		VREV32.16 (swap the 16-bit halves of each 32-bit word)
 *	<<< 12		VSHL/VSRI (shift left, then shift right and insert)
 *	<<< 8		VTBL (general byte permutation; rot8 table below,
 *			addressed through r7)
 *	<<< 7		VSHL/VSRI
 */

/*
 * ROUNDLD: reload the two `c' rows that the preceding ROUND left
 * spilled in the 32-byte scratch area at [fp].
 *
 * It takes the same sixteen row arguments as ROUND so that call sites
 * can pass the same register permutation, but only c2 and c3 are used.
 * They must be consecutive q registers, since they are loaded with a
 * single two-register VLD1, and fp must be 32-byte aligned (:256).
 */
.macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
	vld1.32		{\c2-\c3}, [fp, :256]
.endm

/*
 * ROUND: four ChaCha quarterrounds computed side by side, one per
 * (a_i, b_i, c_i, d_i) group of q registers.
 *
 * Arguments:
 *	a0-a3, b0-b3, c0-c3, d0-d3
 *		q registers holding the sixteen state rows.  c0-c1 and
 *		c2-c3 must each be a consecutive pair, since each pair
 *		is spilled and reloaded with a two-register VST1/VLD1.
 *	c0l	the d register that is the low half of c0; it holds the
 *		rot8 table while c0 is spilled.
 *	d0l,d0h, ..., d3l,d3h
 *		the d-register halves of d0-d3, needed because VTBL
 *		operates on 64-bit registers.
 *
 * Register/memory requirements:
 *	fp	32-byte-aligned address of a 32-byte scratch area
 *	r7	address of the rot8 table (8 bytes, 64-bit aligned)
 *
 * All sixteen q registers hold live state, so there is no spare
 * register for temporaries: first c0-c1, then c2-c3 are spilled to
 * [fp] and reused as temporaries.  On exit c2 and c3 hold garbage and
 * their real values are in memory at [fp]; use ROUNDLD to reload them
 * before they are next needed.
 */
.macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
	/* a += b; d ^= a; d <<<= 16 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor		\d0, \d0, \a0
	veor		\d1, \d1, \a1
	veor		\d2, \d2, \a2
	veor		\d3, \d3, \a3

	vrev32.16	\d0, \d0
	vrev32.16	\d1, \d1
	vrev32.16	\d2, \d2
	vrev32.16	\d3, \d3

	/* c += d; b ^= c; b <<<= 12 */
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3

	vst1.32		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */

	/*
	 * The rotation is VSHL into the destination followed by VSRI,
	 * which inserts the bits shifted out at the top into the low
	 * end of the destination, so no separate OR is needed.
	 */
	veor		\c0, \b0, \c0
	veor		\c1, \b1, \c1
	vshl.u32	\b0, \c0, #12
	vshl.u32	\b1, \c1, #12
	vsri.u32	\b0, \c0, #(32 - 12)
	vsri.u32	\b1, \c1, #(32 - 12)

	veor		\c0, \b2, \c2
	veor		\c1, \b3, \c3
	vshl.u32	\b2, \c0, #12
	vshl.u32	\b3, \c1, #12
	vsri.u32	\b2, \c0, #(32 - 12)
	vsri.u32	\b3, \c1, #(32 - 12)

	vld1.8		{\c0l}, [r7, :64]	/* load rot8 table */

	/* a += b; d ^= a; d <<<= 8 */
	vadd.u32	\a0, \a0, \b0
	vadd.u32	\a1, \a1, \b1
	vadd.u32	\a2, \a2, \b2
	vadd.u32	\a3, \a3, \b3

	veor		\d0, \d0, \a0
	veor		\d1, \d1, \a1
	veor		\d2, \d2, \a2
	veor		\d3, \d3, \a3

	/* one VTBL per 64-bit half, all using the table held in c0l */
	vtbl.8		\d0l, {\d0l}, \c0l	/* <<< 8 */
	vtbl.8		\d0h, {\d0h}, \c0l
	vtbl.8		\d1l, {\d1l}, \c0l
	vtbl.8		\d1h, {\d1h}, \c0l
	vtbl.8		\d2l, {\d2l}, \c0l
	vtbl.8		\d2h, {\d2h}, \c0l
	vtbl.8		\d3l, {\d3l}, \c0l
	vtbl.8		\d3h, {\d3h}, \c0l

	vld1.32		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */

	/* c += d; b ^= c; b <<<= 7 */
	vadd.u32	\c2, \c2, \d2
	vadd.u32	\c3, \c3, \d3
	vadd.u32	\c0, \c0, \d0
	vadd.u32	\c1, \c1, \d1

	vst1.32		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */

	veor		\c2, \b2, \c2
	veor		\c3, \b3, \c3
	vshl.u32	\b2, \c2, #7
	vshl.u32	\b3, \c3, #7
	vsri.u32	\b2, \c2, #(32 - 7)
	vsri.u32	\b3, \c3, #(32 - 7)

	veor		\c2, \b0, \c0
	veor		\c3, \b1, \c1
	vshl.u32	\b0, \c2, #7
	vshl.u32	\b1, \c3, #7
	vsri.u32	\b0, \c2, #(32 - 7)
	vsri.u32	\b1, \c3, #(32 - 7)
	/* c2 and c3 are left clobbered here; their values are at [fp] */
.endm

/*
 * HTOLE32(x)/LE32TOH(x): applied to a q register x wherever 32-bit
 * words cross between host and little-endian representation.  They
 * expand to nothing on little-endian builds and to a byte reversal
 * within each 32-bit lane (VREV32.8) on big-endian builds.
 */
#if _BYTE_ORDER == _LITTLE_ENDIAN
#define	HTOLE32(x)
#define	LE32TOH(x)
#elif _BYTE_ORDER == _BIG_ENDIAN
#define	HTOLE32(x)	vrev32.8	x, x
#define	LE32TOH(x)	vrev32.8	x, x
#endif

	.text
	.p2align 2
/*
 * Offset from this word to .Lconstants.  The functions below load this
 * word and add the address of .Lconstants_addr itself (obtained with
 * ADR) to form the address of .Lconstants, so no absolute address of
 * the table is embedded in the text.
 */
.Lconstants_addr:
	.long	.Lconstants - .

/*
 * chacha_stream256_neon(uint8_t s[256]@r0,
 *     uint32_t blkno@r1,
 *     const uint8_t nonce[12]@r2,
 *     const uint8_t key[32]@r3,
 *     const uint8_t const[16]@sp[0],
 *     unsigned nr@sp[4])
 *
 *	Compute four ChaCha blocks in parallel, with block counters
 *	blkno, blkno+1, blkno+2, blkno+3, and store the resulting 256
 *	bytes at s.  The round loop counts nr down by two per iteration
 *	(one column round plus one diagonal round) and stops when it
 *	reaches zero, so nr must be a positive even number.
 *
 *	Register use after the prologue:
 *
 *	r4		const pointer
 *	r5		remaining rounds
 *	r6, r8, r10	the three nonce words
 *	r7		pointer into .Lconstants (v0123 / rot8)
 *	fp		32-byte-aligned 32-byte spill area for ROUND
 *	lr		value of sp after the register saves
 *	q0-q15		the 16 state words of all four blocks, one word
 *			index per q register, one block per lane
 */
ENTRY(chacha_stream256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := aligned address of space for two 128-bit/16-byte q registers */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters */
	add	ip, sp, #96	/* skip 32 + 64 bytes of saved registers */
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	/*
	 * Actually reserve the spill area by moving sp down to it.
	 * There is no red zone below sp: an interrupt or signal may
	 * use the memory there at any time, which would corrupt the
	 * spilled rows.  lr was saved above and nothing here calls
	 * out, so use it to remember where sp was.
	 */
	mov	lr, sp
	mov	sp, fp

	vld1.32	{q12}, [r4]	/* q12 := constant */
	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
				/* r7 now points at rot8, as ROUND expects */

	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r1		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	/*
	 * Each iteration is a column round followed by a diagonal
	 * round; the diagonal round is the same macro with the b, c
	 * and d rows passed in rotated order.  The first iteration
	 * enters at 2: because nothing has been spilled yet.
	 */
	b	2f

	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	r5, r5, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  We have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x3[12], x3[13]; x3[14], x3[15])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again (undo post-increment) */
	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r4]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the input words back in: constant to words 0-3 ... */
	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q3, q3, q9

	/* ... and key[0:16) to words 4-7 */
	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q6, q6, q8
	vadd.u32 q7, q7, q8

	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q3)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q6)
	LE32TOH(q7)

	vst1.32	{q0-q1}, [r0]!	/* store x0[0:8) */
	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	/* key[16:32) to words 8-11 */
	vadd.u32 q8, q8, q0
	vadd.u32 q9, q9, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q11, q11, q0

	/* nonce to words 13-15; the block numbers were added to q12 above */
	vadd.u32 q12, q12, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q14, q14, q1
	vadd.u32 q15, q15, q1

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q11)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q14)
	LE32TOH(q15)

	/* prepare to zero temporary space on stack */
	vmov.i32 q0, #0
	vmov.i32 q1, #0

	/*
	 * x0[0:8) was already stored from q0-q1 above; store the rest
	 * of block 0, then blocks 1, 2 and 3, in output order.
	 */
	vst1.32	{q8}, [r0]!
	vst1.32	{q12}, [r0]!
	vst1.32	{q2}, [r0]!
	vst1.32	{q6}, [r0]!
	vst1.32	{q10}, [r0]!
	vst1.32	{q14}, [r0]!
	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
	vst1.32	{q5}, [r0]!
	vst1.32	{q9}, [r0]!
	vst1.32	{q13}, [r0]!
	vst1.32 {q3}, [r0]!
	vst1.32 {q7}, [r0]!
	vst1.32 {q11}, [r0]!
	vst1.32 {q15}, [r0]

	/* zero temporary space on the stack */
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	mov	sp, lr		/* release the spill area */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
 *     uint32_t blkno@r2,
 *     const uint8_t nonce[12]@r3,
 *     const uint8_t key[32]@sp[0],
 *     const uint8_t const[16]@sp[4],
 *     unsigned nr@sp[8])
 *
 *	Compute four ChaCha blocks in parallel, with block counters
 *	blkno, blkno+1, blkno+2, blkno+3, XOR them into the 256 bytes
 *	at p, and store the result at s.  The round loop counts nr
 *	down by two per iteration (one column round plus one diagonal
 *	round) and stops when it reaches zero, so nr must be a positive
 *	even number.
 *
 *	Register use after the prologue:
 *
 *	r4		key pointer
 *	r5		const pointer
 *	r6, r8, r10	the three nonce words
 *	r7		pointer into .Lconstants (v0123 / rot8)
 *	ip		remaining rounds
 *	fp		32-byte-aligned 32-byte spill area for ROUND
 *	lr		value of sp after the register saves
 *	q0-q15		the 16 state words of all four blocks, one word
 *			index per q register, one block per lane
 */
ENTRY(chacha_stream_xor256_neon)
	/* save callee-saves registers */
	push	{r4, r5, r6, r7, r8, r10, fp, lr}
	vpush	{d8-d15}

	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
	ldr	r7, .Lconstants_addr
	adr	r6, .Lconstants_addr

	/* fp := aligned address of space for two 128-bit/16-byte q registers */
	sub	fp, sp, #0x20
	bic	fp, fp, #0x1f	/* align */

	/* get parameters */
	add	ip, sp, #96	/* skip 32 + 64 bytes of saved registers */
	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */

	/*
	 * Actually reserve the spill area by moving sp down to it.
	 * There is no red zone below sp: an interrupt or signal may
	 * use the memory there at any time, which would corrupt the
	 * spilled rows.  lr was saved above and nothing here calls
	 * out, so use it to remember where sp was.
	 */
	mov	lr, sp
	mov	sp, fp

	vld1.32	{q12}, [r5]	/* q12 := constant */
	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
				/* r7 now points at rot8, as ROUND expects */

	vdup.32	q0, d24[0]	/* q0-q3 := constant */
	vdup.32	q1, d24[1]
	vdup.32	q2, d25[0]
	vdup.32	q3, d25[1]
	vdup.32	q12, r2		/* q12 := (blkno, blkno, blkno, blkno) */
	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
	vdup.32	q5, d26[1]
	vdup.32	q6, d27[0]
	vdup.32	q7, d27[1]
	vdup.32	q8, d28[0]
	vdup.32	q9, d28[1]
	vdup.32	q10, d29[0]
	vdup.32	q11, d29[1]
	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
	vdup.32	q13, r6		/* q13-q15 := nonce */
	vdup.32	q14, r8
	vdup.32	q15, r10

	HTOLE32(q0)
	HTOLE32(q1)
	HTOLE32(q2)
	HTOLE32(q3)
	HTOLE32(q4)
	HTOLE32(q5)
	HTOLE32(q6)
	HTOLE32(q7)
	HTOLE32(q8)
	HTOLE32(q9)
	HTOLE32(q10)
	HTOLE32(q11)
	HTOLE32(q12)
	HTOLE32(q13)
	HTOLE32(q14)
	HTOLE32(q15)

	/*
	 * Each iteration is a column round followed by a diagonal
	 * round; the diagonal round is the same macro with the b, c
	 * and d rows passed in rotated order.  The first iteration
	 * enters at 2: because nothing has been spilled yet.
	 */
	b	2f

	_ALIGN_TEXT
1:	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
2:	subs	ip, ip, #2
	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
			d16, d24,d25, d26,d27, d28,d29, d30,d31
	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
			d20, d30,d31, d24,d25, d26,d27, d28,d29
	bne	1b

	/*
	 * q8-q9 are free / saved on the stack.  Now for the real fun:
	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
	 * transposed from one another, and the x[i] are in general
	 * registers and memory.  So we have:
	 *
	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
	 *	...
	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
	 *
	 * where xi[j] is the jth word of the ith 16-word block.  Zip
	 * consecutive pairs with vzip.32, and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
	 *	...
	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
	 *
	 * As 64-bit d registers, this is:
	 *
	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
	 *	...
	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
	 *
	 * Swap d1<->d4, d3<->d6, ..., and you get:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	...
	 *	q15 = (x3[12], x3[13]; x3[14], x3[15])
	 */

	sub	r7, r7, #0x10	/* r7 := v0123 again (undo post-increment) */
	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */

	vzip.32	q0, q1
	vzip.32	q2, q3
	vzip.32	q4, q5
	vzip.32	q6, q7

	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q9}, [r5]	/* q9 := constant */
	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */

	vswp	d1, d4
	vswp	d9, d12
	vswp	d3, d6
	vswp	d11, d14

	/*
	 * At this point, the blocks are:
	 *
	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
	 *
	 * The first two rows to write out are q0 = x0[0:4) and q4 =
	 * x0[4:8).  If we first swap q1 and q4, then once we've
	 * written them out we free up consecutive registers q0-q1 for
	 * store-multiple.
	 */

	vswp	q1, q4

	/* add the input words back in: constant to words 0-3 ... */
	vadd.u32 q0, q0, q9
	vadd.u32 q4, q4, q9
	vadd.u32 q2, q2, q9
	vadd.u32 q3, q3, q9

	/* ... and key[0:16) to words 4-7 */
	vadd.u32 q1, q1, q8
	vadd.u32 q5, q5, q8
	vadd.u32 q6, q6, q8
	vadd.u32 q7, q7, q8

	vld1.32 {q8-q9}, [r1]!	/* load plaintext bytes [0:32) */

	LE32TOH(q0)
	LE32TOH(q1)
	LE32TOH(q2)
	LE32TOH(q6)
	LE32TOH(q4)
	LE32TOH(q5)
	LE32TOH(q3)
	LE32TOH(q7)

	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
	veor	q1, q1, q9

	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */

	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
	vmov	d2, r3, r6
	vmov	d3, r8, r10

	vzip.32	q8, q9
	vzip.32	q10, q11
	vzip.32	q12, q13
	vzip.32	q14, q15

	vswp	d17, d20
	vswp	d25, d28
	vswp	d19, d22
	vswp	d27, d30

	vswp	q9, q12		/* free up q9 earlier for consecutive q8-q9 */

	/* key[16:32) to words 8-11 (q12 holds words 8-11 after the swap) */
	vadd.u32 q8, q8, q0
	vadd.u32 q12, q12, q0
	vadd.u32 q10, q10, q0
	vadd.u32 q11, q11, q0

	/* nonce to words 13-15; the block numbers were added above */
	vadd.u32 q9, q9, q1
	vadd.u32 q13, q13, q1
	vadd.u32 q14, q14, q1
	vadd.u32 q15, q15, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */

	LE32TOH(q8)
	LE32TOH(q9)
	LE32TOH(q10)
	LE32TOH(q14)
	LE32TOH(q12)
	LE32TOH(q13)
	LE32TOH(q11)
	LE32TOH(q15)

	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
	veor	q1, q1, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */

	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
	veor	q6, q6, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
	vst1.32	{q2}, [r0]!	/* store ciphertext bytes [64:80) */

	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
	veor	q14, q14, q1

	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
	vst1.32	{q6}, [r0]!	/* store ciphertext bytes [80:96) */

	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
	veor	q5, q5, q9

	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
	vst1.32	{q10}, [r0]!	/* store ciphertext bytes [96:112) */

	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
	veor	q13, q13, q1

	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
	vst1.32	{q14}, [r0]!	/* store ciphertext bytes [112:128) */

	veor	q8, q3, q8	/* compute ciphertext bytes [192:224) */
	veor	q9, q7, q9

	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */

	veor	q0, q11, q0	/* compute ciphertext bytes [224:256) */
	veor	q1, q15, q1

	vst1.32	{q8-q9}, [r0]!	/* store ciphertext bytes [192:224) */
	vst1.32	{q0-q1}, [r0]	/* store ciphertext bytes [224:256) */

	/* zero temporary space on the stack */
	vmov.i32 q0, #0
	vmov.i32 q1, #0
	vst1.8	{q0-q1}, [fp, :256]

	/* restore callee-saves registers and stack */
	mov	sp, lr		/* release the spill area */
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
	bx	lr
END(chacha_stream_xor256_neon)

	/*
	 * Constant tables.  They are 16-byte aligned because v0123 is
	 * loaded with a :128 alignment hint, and rot8 must immediately
	 * follow the 16 bytes of v0123: the functions above reach it by
	 * post-incrementing the pointer they used to load v0123.
	 */
	.section .rodata
	.p2align 4
.Lconstants:

	/* per-lane offsets added to blkno to number the four blocks */
	.type	v0123,%object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * VTBL byte-index table used by ROUND for `<<< 8'.  Laid out
	 * little-endian, the index bytes are 3,0,1,2, 7,4,5,6: within
	 * each 32-bit word the top byte moves to the bottom and the
	 * others move up one place, i.e. a left rotation by 8 bits.
	 * NOTE(review): on a big-endian build .long emits these bytes
	 * in the opposite order within each word -- confirm that the
	 * resulting permutation is still the intended one.
	 */
	.type	rot8,%object
rot8:
	.long	0x02010003, 0x06050407
END(rot8)

cvs diff -r1.2 -r1.3 src/tests/sys/crypto/chacha/Makefile (expand / switch to unified diff)

--- src/tests/sys/crypto/chacha/Makefile 2020/07/27 20:51:29 1.2
+++ src/tests/sys/crypto/chacha/Makefile 2020/07/28 20:08:48 1.3
@@ -1,38 +1,40 @@ @@ -1,38 +1,40 @@
1# $NetBSD: Makefile,v 1.2 2020/07/27 20:51:29 riastradh Exp $ 1# $NetBSD: Makefile,v 1.3 2020/07/28 20:08:48 riastradh Exp $
2 2
3.include <bsd.own.mk> 3.include <bsd.own.mk>
4 4
5TESTSDIR= ${TESTSBASE}/sys/crypto/chacha 5TESTSDIR= ${TESTSBASE}/sys/crypto/chacha
6 6
7TESTS_C= t_chacha 7TESTS_C= t_chacha
8 8
9AFLAGS+= -D_LOCORE 9AFLAGS+= -D_LOCORE
10 10
11.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha 11.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha
12CPPFLAGS+= -I${NETBSDSRCDIR}/sys 12CPPFLAGS+= -I${NETBSDSRCDIR}/sys
13 13
14SRCS.t_chacha+= t_chacha.c 14SRCS.t_chacha+= t_chacha.c
15 15
16SRCS.t_chacha+= chacha_ref.c 16SRCS.t_chacha+= chacha_ref.c
17SRCS.t_chacha+= chacha_selftest.c 17SRCS.t_chacha+= chacha_selftest.c
18 18
19.if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*) 19.if !empty(MACHINE_ARCH:Mearmv7*) || !empty(MACHINE_ARCH:Maarch64*)
20 20
21.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm 21.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm
22CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm 22CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/arm
23 23
24SRCS.t_chacha+= chacha_neon.c 24SRCS.t_chacha+= chacha_neon.c
25.if !empty(MACHINE_ARCH:Maarch64*) 25.if !empty(MACHINE_ARCH:Mearmv7*)
 26SRCS.t_chacha+= chacha_neon_32.S
 27.elif !empty(MACHINE_ARCH:Maarch64*)
26SRCS.t_chacha+= chacha_neon_64.S 28SRCS.t_chacha+= chacha_neon_64.S
27.endif 29.endif
28SRCS.t_chacha+= chacha_neon_impl.c 30SRCS.t_chacha+= chacha_neon_impl.c
29 31
30.endif # earmv7 or aarch64 32.endif # earmv7 or aarch64
31 33
32.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64" 34.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64"
33 35
34.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 36.PATH: ${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86
35CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86 37CPPFLAGS+= -I${NETBSDSRCDIR}/sys/crypto/chacha/arch/x86
36 38
37SRCS.t_chacha+= chacha_sse2.c 39SRCS.t_chacha+= chacha_sse2.c
38SRCS.t_chacha+= chacha_sse2_impl.c 40SRCS.t_chacha+= chacha_sse2_impl.c