@@ -1,692 +1,641 @@
-/*	$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $	*/
+/*	$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $	*/
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
  * All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
+ *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 #include <machine/asm.h>
-RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $")
+RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $")
 	.fpu	neon
 /*
  * ChaCha round, split up so we can interleave the quarterrounds on
  * independent rows/diagonals to maximize pipeline efficiency, with
  * spills to deal with the scarcity of registers.  Reference:
+ *
  *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
  *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
  *	https://cr.yp.to/papers.html#chacha
+ *
  *	a += b; d ^= a; d <<<= 16;
  *	c += d; b ^= c; b <<<= 12;
  *	a += b; d ^= a; d <<<= 8;
  *	c += d; b ^= c; b <<<= 7;
+ *
  * The rotations are implemented with:
  *	<<< 16		VREV32.16 (swap the 16-bit halves of each word)
  *	<<< 12		VSHL/VSRI (shift left, then shift right and insert)
  *	<<< 8		VTBL (general byte permutation; rot8 table below,
  *			addressed through r7)
  *	<<< 7		VSHL/VSRI
  */
 /*
  * ROUNDLD: reload the two c rows that the preceding ROUND spilled to
  * the 32-byte scratch area at [fp] (ROUND leaves its c2/c3 registers
  * holding temporaries).  Takes the same sixteen row arguments as
  * ROUND so call sites line up, but only c2 and c3 are used.
  */
 .macro	ROUNDLD	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3
 	vld1.32		{\c2-\c3}, [fp, :256]
 .endm
 /*
  * ROUND: one ChaCha round, i.e. the four quarterrounds on
  * (ai, bi, ci, di) for i = 0..3, interleaved.
  *
  *	a0-a3, b0-b3, c0-c3, d0-d3
  *		q registers holding the sixteen rows of state
  *	c0l
  *		d register that temporarily holds the rot8 table
  *		(the call sites pass the low half of c0)
  *	d0l,d0h, ..., d3l,d3h
  *		the d-register halves of d0-d3, needed because the
  *		vtbl.8 instructions below operate on d registers
  *
  * Expects fp to point at a 256-bit-aligned 32-byte scratch area and
  * r7 to point at the rot8 table.  All of c0-c3 must be live in
  * registers on entry.  On exit c2 and c3 hold temporaries; their
  * real values are in the scratch area at [fp] and have to be
  * reloaded (see ROUNDLD) before they are used again.
  */
 .macro	ROUND	a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3, d0,d1,d2,d3, c0l, d0l,d0h,d1l,d1h,d2l,d2h,d3l,d3h
 	/* a += b; d ^= a; d <<<= 16 */
 	vadd.u32	\a0, \a0, \b0
 	vadd.u32	\a1, \a1, \b1
 	vadd.u32	\a2, \a2, \b2
 	vadd.u32	\a3, \a3, \b3
 	veor		\d0, \d0, \a0
 	veor		\d1, \d1, \a1
 	veor		\d2, \d2, \a2
 	veor		\d3, \d3, \a3
 	vrev32.16	\d0, \d0
 	vrev32.16	\d1, \d1
 	vrev32.16	\d2, \d2
 	vrev32.16	\d3, \d3
 	/* c += d; b ^= c; b <<<= 12 */
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3
 	vst1.32		{\c0-\c1}, [fp, :256]	/* free c0 and c1 as temps */
 	/* rotate: shift left into b, then insert the high bits shifted right */
 	veor		\c0, \b0, \c0
 	veor		\c1, \b1, \c1
 	vshl.u32	\b0, \c0, #12
 	vshl.u32	\b1, \c1, #12
 	vsri.u32	\b0, \c0, #(32 - 12)
 	vsri.u32	\b1, \c1, #(32 - 12)
 	veor		\c0, \b2, \c2
 	veor		\c1, \b3, \c3
 	vshl.u32	\b2, \c0, #12
 	vshl.u32	\b3, \c1, #12
 	vsri.u32	\b2, \c0, #(32 - 12)
 	vsri.u32	\b3, \c1, #(32 - 12)
 	vld1.8		{\c0l}, [r7, :64]	/* load rot8 table */
 	/* a += b; d ^= a; d <<<= 8 */
 	vadd.u32	\a0, \a0, \b0
 	vadd.u32	\a1, \a1, \b1
 	vadd.u32	\a2, \a2, \b2
 	vadd.u32	\a3, \a3, \b3
 	veor		\d0, \d0, \a0
 	veor		\d1, \d1, \a1
 	veor		\d2, \d2, \a2
 	veor		\d3, \d3, \a3
 	vtbl.8		\d0l, {\d0l}, \c0l	/* <<< 8 */
 	vtbl.8		\d0h, {\d0h}, \c0l
 	vtbl.8		\d1l, {\d1l}, \c0l
 	vtbl.8		\d1h, {\d1h}, \c0l
 	vtbl.8		\d2l, {\d2l}, \c0l
 	vtbl.8		\d2h, {\d2h}, \c0l
 	vtbl.8		\d3l, {\d3l}, \c0l
 	vtbl.8		\d3h, {\d3h}, \c0l
 	vld1.32		{\c0-\c1}, [fp, :256]	/* restore c0 and c1 */
 	/* c += d; b ^= c; b <<<= 7 */
 	vadd.u32	\c2, \c2, \d2
 	vadd.u32	\c3, \c3, \d3
 	vadd.u32	\c0, \c0, \d0
 	vadd.u32	\c1, \c1, \d1
 	vst1.32		{\c2-\c3}, [fp, :256]	/* free c2 and c3 as temps */
 	/* c2 and c3 stay spilled past the end of the macro */
 	veor		\c2, \b2, \c2
 	veor		\c3, \b3, \c3
 	vshl.u32	\b2, \c2, #7
 	vshl.u32	\b3, \c3, #7
 	vsri.u32	\b2, \c2, #(32 - 7)
 	vsri.u32	\b3, \c3, #(32 - 7)
 	veor		\c2, \b0, \c0
 	veor		\c3, \b1, \c1
 	vshl.u32	\b0, \c2, #7
 	vshl.u32	\b1, \c3, #7
 	vsri.u32	\b0, \c2, #(32 - 7)
 	vsri.u32	\b1, \c3, #(32 - 7)
 .endm
 /*
  * Byte-order helpers, applied to each q register before and after
  * the rounds: they expand to nothing on little-endian and to
  * vrev32.8 (byte-reverse each 32-bit lane in place) on big-endian.
  */
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 #define	HTOLE32(x)
 #define	LE32TOH(x)
 #elif _BYTE_ORDER == _BIG_ENDIAN
 #define	HTOLE32(x)	vrev32.8	x, x
 #define	LE32TOH(x)	vrev32.8	x, x
 #endif
 	.text
 	.p2align 2
 /*
  * Offset from this word to .Lconstants.  The functions below load
  * this word, take its address with adr, and add the two to find the
  * constant tables without using an absolute address.
  */
 .Lconstants_addr:
 	.long	.Lconstants - .
 /*
  * chacha_stream256_neon(uint8_t s[256]@r0,
  *     uint32_t blkno@r1,
  *     const uint8_t nonce[12]@r2,
  *     const uint8_t key[32]@r3,
  *     const uint8_t const[16]@sp[0],
  *     unsigned nr@sp[4])
  *
  *	Compute four ChaCha blocks side by side, with block counters
  *	blkno, blkno+1, blkno+2, blkno+3, and store them to s.  nr is
  *	the round count: the loop below subtracts two per iteration
  *	(one ROUND pair) and exits only when the count is exactly
  *	zero, so nr is presumably required to be even and nonzero --
  *	confirm against the callers.
  */
 ENTRY(chacha_stream256_neon)
 	/* save callee-saves registers */
 	push	{r4, r5, r6, r7, r8, r10, fp, lr}
 	vpush	{d8-d15}
 	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
 	ldr	r7, .Lconstants_addr
 	adr	r6, .Lconstants_addr
 	/*
 	 * reserve space for two 128-bit/16-byte q registers
 	 *
 	 * NOTE(review): fp is pointed below sp, but sp itself is not
 	 * lowered, so this scratch area lies below the stack pointer
 	 * -- confirm nothing asynchronous can write there.
 	 */
 	sub	fp, sp, #0x20
 	bic	fp, fp, #0x1f	/* align */
 	/*
 	 * get parameters; the stack arguments sit above the 32 + 64
 	 * bytes pushed by the push/vpush above
 	 */
 	add	ip, sp, #96
 	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
 	ldm	ip, {r4, r5}	/* r4 := const, r5 := nr */
 	ldm	r2, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
 	vld1.32	{q12}, [r4]	/* q12 := constant */
 	vld1.32	{q13-q14}, [r3]	/* q13-q14 := key */
 	/* the post-increment leaves r7 pointing at rot8, as ROUND expects */
 	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
 	vdup.32	q2, d25[0]
 	vdup.32	q3, d25[1]
 	vdup.32	q12, r1		/* q12 := (blkno, blkno, blkno, blkno) */
 	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
 	vdup.32	q5, d26[1]
 	vdup.32	q6, d27[0]
 	vdup.32	q7, d27[1]
 	vdup.32	q8, d28[0]
 	vdup.32	q9, d28[1]
 	vdup.32	q10, d29[0]
 	vdup.32	q11, d29[1]
 	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
 	vdup.32	q13, r6		/* q13-q15 := nonce */
 	vdup.32	q14, r8
 	vdup.32	q15, r10
 	HTOLE32(q0)
 	HTOLE32(q1)
 	HTOLE32(q2)
 	HTOLE32(q3)
 	HTOLE32(q4)
 	HTOLE32(q5)
 	HTOLE32(q6)
 	HTOLE32(q7)
 	HTOLE32(q8)
 	HTOLE32(q9)
 	HTOLE32(q10)
 	HTOLE32(q11)
 	HTOLE32(q12)
 	HTOLE32(q13)
 	HTOLE32(q14)
 	HTOLE32(q15)
 	/*
 	 * Each iteration is a column round followed by a diagonal
 	 * round (the same rows passed in rotated order).  The branch
 	 * presumably enters past the first ROUNDLD, since nothing has
 	 * been spilled yet on the first pass.
 	 */
 	b	2f
 	_ALIGN_TEXT
 :	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
 :	subs	r5, r5, #2
 	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
 			d16, d24,d25, d26,d27, d28,d29, d30,d31
 	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
 	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
 			d20, d30,d31, d24,d25, d26,d27, d28,d29
 	bne	1b
 	/*
 	 * q8-q9 are free / saved on the stack.  We have:
+	 *
 	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
 	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
 	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
 	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
 	 *	...
 	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
+	 *
 	 * where xi[j] is the jth word of the ith 16-word block.  Zip
 	 * consecutive pairs with vzip.32, and you get:
+	 *
 	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
 	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
 	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
 	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
 	 *	...
 	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
+	 *
 	 * As 64-bit d registers, this is:
+	 *
 	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
 	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
 	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
 	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
 	 *	...
 	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
+	 *
 	 * Swap d1<->d4, d3<->d6, ..., and you get:
+	 *
 	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
 	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
 	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
 	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
 	 *	...
 	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
 	 */
 	sub	r7, r7, #0x10
 	vdup.32	q8, r1		/* q8 := (blkno, blkno, blkno, blkno) */
 	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
 	vzip.32	q0, q1
 	vzip.32	q2, q3
 	vzip.32	q4, q5
 	vzip.32	q6, q7
 	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
 	vld1.32	{q9}, [r4]	/* q9 := constant */
 	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
 	vld1.32	{q8}, [r3]!	/* q8 := key[0:16) */
 	vswp	d1, d4
 	vswp	d9, d12
 	vswp	d3, d6
 	vswp	d11, d14
 	/*
 	 * At this point, the blocks are:
+	 *
 	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
 	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
 	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
 	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
 	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
 	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
 	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
 	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
+	 *
 	 * The first two rows to write out are q0 = x0[0:4) and q4 =
-	 * x0[4:8).  If we first swap q1 and q4, then once we've
+	 * x0[4:8).  Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14
-	 * written them out we free up consecutive registers q0-q1 for
+	 * enables us to issue all stores in consecutive pairs:
-	 * store-multiple.
+	 *	x0 in q0-q1
 	 *	x1 in q8-q9
 	 *	x2 in q2-q3
 	 *	x3 in q10-q11
 	 *	x4 in q4-q5
 	 *	x5 in q12-q13
 	 *	x6 in q6-q7
 	 *	x7 in q14-q15
 	 */
 	vswp	q1, q4
 	vswp	q3, q6
 	vadd.u32 q0, q0, q9
 	vadd.u32 q4, q4, q9
 	vadd.u32 q2, q2, q9
-	vadd.u32 q3, q3, q9
+	vadd.u32 q6, q6, q9
 	vadd.u32 q1, q1, q8
 	vadd.u32 q5, q5, q8
-	vadd.u32 q6, q6, q8
+	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */
 	LE32TOH(q0)
 	LE32TOH(q1)
 	LE32TOH(q2)
 	LE32TOH(q3)
 	LE32TOH(q4)
 	LE32TOH(q5)
 	LE32TOH(q6)
 	LE32TOH(q7)
 	vst1.32	{q0-q1}, [r0]!
 	vld1.32	{q0}, [r3]	/* q0 := key[16:32) */
 	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
 	vzip.32	q8, q9
 	vzip.32	q10, q11
 	vzip.32	q12, q13
 	vzip.32	q14, q15
 	vswp	d17, d20
 	vswp	d25, d28
 	vswp	d19, d22
 	vswp	d27, d30
 	vswp	q9, q12
 	vswp	q11, q14
 	vadd.u32 q8, q8, q0
-	vadd.u32 q9, q9, q0
+	vadd.u32 q12, q12, q0
 	vadd.u32 q10, q10, q0
-	vadd.u32 q11, q11, q0
+	vadd.u32 q14, q14, q0
-	vadd.u32 q12, q12, q1
+	vadd.u32 q9, q9, q1
 	vadd.u32 q13, q13, q1
-	vadd.u32 q14, q14, q1
+	vadd.u32 q11, q11, q1
 	vadd.u32 q15, q15, q1
 	LE32TOH(q8)
 	LE32TOH(q9)
 	LE32TOH(q10)
 	LE32TOH(q11)
 	LE32TOH(q12)
 	LE32TOH(q13)
 	LE32TOH(q14)
 	LE32TOH(q15)
-	/* prepare to zero temporary space on stack */
+	/* vst1.32	{q0-q1}, [r0]! */
-	vmov.i32 q0, #0
+	vst1.32	{q8-q9}, [r0]!
-	vmov.i32 q1, #0
+	vst1.32	{q2-q3}, [r0]!
 	vst1.32	{q10-q11}, [r0]!
-	/* vst1.32	{q0}, [r0]! */
+	vst1.32	{q4-q5}, [r0]!
-	/* vst1.32	{q1}, [r0]! */	/* (was q4 before vswp) */
+	vst1.32	{q12-q13}, [r0]!
-	vst1.32	{q8}, [r0]!
+	vst1.32 {q6-q7}, [r0]!
-	vst1.32	{q12}, [r0]!
+	vst1.32 {q14-q15}, [r0]
 	vst1.32	{q2}, [r0]!
 	vst1.32	{q6}, [r0]!
 	vst1.32	{q10}, [r0]!
 	vst1.32	{q14}, [r0]!
 	vst1.32	{q4}, [r0]!	/* (was q1 before vswp) */
 	vst1.32	{q5}, [r0]!
 	vst1.32	{q9}, [r0]!
 	vst1.32	{q13}, [r0]!
 	vst1.32 {q3}, [r0]!
 	vst1.32 {q7}, [r0]!
 	vst1.32 {q11}, [r0]!
 	vst1.32 {q15}, [r0]
 	/* zero temporary space on the stack */
 	vmov.i32 q0, #0
 	vmov.i32 q1, #0
 	vst1.8	{q0-q1}, [fp, :256]
 	/* restore callee-saves registers and stack */
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
 	bx	lr
 END(chacha_stream256_neon)
 /*
  * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1,
  *     uint32_t blkno@r2,
  *     const uint8_t nonce[12]@r3,
  *     const uint8_t key[32]@sp[0],
  *     const uint8_t const[16]@sp[4],
  *     unsigned nr@sp[8])
  *
  *	Compute four ChaCha blocks side by side, with block counters
  *	blkno, blkno+1, blkno+2, blkno+3, XOR them into the 256 bytes
  *	read from p, and store the result to s.  nr is the round
  *	count: the loop below subtracts two per iteration (one ROUND
  *	pair) and exits only when the count is exactly zero, so nr is
  *	presumably required to be even and nonzero -- confirm against
  *	the callers.
  */
 ENTRY(chacha_stream_xor256_neon)
 	/* save callee-saves registers */
 	push	{r4, r5, r6, r7, r8, r10, fp, lr}
 	vpush	{d8-d15}
 	/* r7 := .Lconstants - .Lconstants_addr, r6 := .Lconstants_addr */
 	ldr	r7, .Lconstants_addr
 	adr	r6, .Lconstants_addr
 	/*
 	 * reserve space for two 128-bit/16-byte q registers
 	 *
 	 * NOTE(review): fp is pointed below sp, but sp itself is not
 	 * lowered, so this scratch area lies below the stack pointer
 	 * -- confirm nothing asynchronous can write there.
 	 */
 	sub	fp, sp, #0x20
 	bic	fp, fp, #0x1f	/* align */
 	/*
 	 * get parameters; the stack arguments sit above the 32 + 64
 	 * bytes pushed by the push/vpush above
 	 */
 	add	ip, sp, #96
 	add	r7, r7, r6	/* r7 := .Lconstants (= v0123) */
 	ldm	ip, {r4, r5, ip}	/* r4 := key, r5 := const, ip := nr */
 	ldm	r3, {r6, r8, r10}	/* (r6, r8, r10) := nonce[0:12) */
 	vld1.32	{q12}, [r5]	/* q12 := constant */
 	vld1.32	{q13-q14}, [r4]	/* q13-q14 := key */
 	/* the post-increment leaves r7 pointing at rot8, as ROUND expects */
 	vld1.32	{q15}, [r7, :128]! /* q15 := (0, 1, 2, 3) (128-bit aligned) */
 	vdup.32	q0, d24[0]	/* q0-q3 := constant */
 	vdup.32	q1, d24[1]
 	vdup.32	q2, d25[0]
 	vdup.32	q3, d25[1]
 	vdup.32	q12, r2		/* q12 := (blkno, blkno, blkno, blkno) */
 	vdup.32	q4, d26[0]	/* q4-q11 := (key, key, key, key) */
 	vdup.32	q5, d26[1]
 	vdup.32	q6, d27[0]
 	vdup.32	q7, d27[1]
 	vdup.32	q8, d28[0]
 	vdup.32	q9, d28[1]
 	vdup.32	q10, d29[0]
 	vdup.32	q11, d29[1]
 	vadd.u32 q12, q12, q15	/* q12 := (blkno,blkno+1,blkno+2,blkno+3) */
 	vdup.32	q13, r6		/* q13-q15 := nonce */
 	vdup.32	q14, r8
 	vdup.32	q15, r10
 	HTOLE32(q0)
 	HTOLE32(q1)
 	HTOLE32(q2)
 	HTOLE32(q3)
 	HTOLE32(q4)
 	HTOLE32(q5)
 	HTOLE32(q6)
 	HTOLE32(q7)
 	HTOLE32(q8)
 	HTOLE32(q9)
 	HTOLE32(q10)
 	HTOLE32(q11)
 	HTOLE32(q12)
 	HTOLE32(q13)
 	HTOLE32(q14)
 	HTOLE32(q15)
 	/*
 	 * Each iteration is a column round followed by a diagonal
 	 * round (the same rows passed in rotated order).  The branch
 	 * presumably enters past the first ROUNDLD, since nothing has
 	 * been spilled yet on the first pass.
 	 */
 	b	2f
 	_ALIGN_TEXT
 :	ROUNDLD	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14
 :	subs	ip, ip, #2
 	ROUND	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \
 			d16, d24,d25, d26,d27, d28,d29, d30,d31
 	ROUNDLD	q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15
 	ROUND	q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \
 			d20, d30,d31, d24,d25, d26,d27, d28,d29
 	bne	1b
 	/*
 	 * q8-q9 are free / saved on the stack.  Now for the real fun:
 	 * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in
 	 * {0,1,2,...,15}.  The twist is that the p[i] and the y[i] are
 	 * transposed from one another, and the x[i] are in general
-	 * registers and memory.  So we have:
+	 * registers and memory.  See comments in chacha_stream256_neon
 	 * for the layout with swaps.
 	 *	q0 = (x0[0], x1[0]; x2[0], x3[0])
 	 *	q1 = (x0[1], x1[1]; x2[1], x3[1])
 	 *	q2 = (x0[2], x1[2]; x2[2], x3[2])
 	 *	q3 = (x0[3], x1[3]; x2[3], x3[3])
 	 *	...
 	 *	q15 = (x0[15], x1[15]; x2[15], x3[15])
 	 * where xi[j] is the jth word of the ith 16-word block.  Zip
 	 * consecutive pairs with vzip.32, and you get:
 	 *	q0 = (x0[0], x0[1]; x1[0], x1[1])
 	 *	q1 = (x2[0], x2[1]; x3[0], x3[1])
 	 *	q2 = (x0[2], x0[3]; x1[2], x1[3])
 	 *	q3 = (x2[2], x2[3]; x3[2], x3[3])
 	 *	...
 	 *	q15 = (x2[14], x2[15]; x3[14], x3[15])
 	 * As 64-bit d registers, this is:
 	 *	d0 = (x0[0], x0[1])	d1 = (x1[0], x1[1])
 	 *	d2 = (x2[0], x2[1])	d3 = (x3[0], x3[1])
 	 *	d4 = (x0[2], x0[3])	d5 = (x1[2], x1[3])
 	 *	d6 = (x2[2], x2[3])	d7 = (x3[2], x3[3])
 	 *	...
 	 *	d30 = (x2[14], x2[15])	d31 = (x3[14], x3[15])
 	 * Swap d1<->d4, d3<->d6, ..., and you get:
 	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
 	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
 	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
 	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
 	 *	...
 	 *	q15 = (x15[0], x15[1]; x15[2], x15[3])
 	 */
 	sub	r7, r7, #0x10
 	vdup.32	q8, r2		/* q8 := (blkno, blkno, blkno, blkno) */
 	vld1.32	{q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */
 	vzip.32	q0, q1
 	vzip.32	q2, q3
 	vzip.32	q4, q5
 	vzip.32	q6, q7
 	vadd.u32 q8, q8, q9	/* q8 := (blkno,blkno+1,blkno+2,blkno+3) */
 	vld1.32	{q9}, [r5]	/* q9 := constant */
 	vadd.u32 q12, q12, q8	/* q12 += (blkno,blkno+1,blkno+2,blkno+3) */
 	vld1.32	{q8}, [r4]!	/* q8 := key[0:16) */
 	vswp	d1, d4
 	vswp	d9, d12
 	vswp	d3, d6
 	vswp	d9, d12
 	vswp	d1, d4
 	vswp	d11, d14
 	/*
 	 * At this point, the blocks are:
 	 *	q0 = (x0[0], x0[1]; x0[2], x0[3])
 	 *	q1 = (x2[0], x2[1]; x2[2], x2[3])
 	 *	q2 = (x1[0], x1[1]; x1[2], x1[3])
 	 *	q3 = (x3[0], x3[1]; x3[2], x3[3])
 	 *	q4 = (x0[4], x0[5]; x0[6], x0[7])
 	 *	q5 = (x2[4], x2[5]; x2[6], x2[7])
 	 *	q6 = (x1[4], x1[5]; x1[6], x1[7])
 	 *	q7 = (x3[4], x3[5]; x3[6], x3[7])
 	 * The first two rows to write out are q0 = x0[0:4) and q4 =
 	 * x0[4:8).  If we first swap q1 and q4, then once we've
 	 * written them out we free up consecutive registers q0-q1 for
 	 * store-multiple.
 	 */
 	vswp	q1, q4
 	vswp	q3, q6
 	vadd.u32 q0, q0, q9
 	vadd.u32 q4, q4, q9
 	vadd.u32 q2, q2, q9
-	vadd.u32 q3, q3, q9
+	vadd.u32 q6, q6, q9
 	vadd.u32 q1, q1, q8
 	vadd.u32 q5, q5, q8
-	vadd.u32 q6, q6, q8
+	vadd.u32 q3, q3, q8
 	vadd.u32 q7, q7, q8
 	vld1.32 {q8-q9}, [r1]!	/* load plaintext bytes [0:32) */
 	LE32TOH(q0)
 	LE32TOH(q1)
 	LE32TOH(q2)
 	LE32TOH(q6)
 	LE32TOH(q4)
 	LE32TOH(q5)
 	LE32TOH(q3)
 	LE32TOH(q7)
 	veor	q0, q0, q8	/* compute ciphertext bytes [0:32) */
 	veor	q1, q1, q9
 	vld1.32 {q8-q9}, [fp, :256]	/* restore q8-q9 */
 	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [0:32) */
 	vld1.32	{q0}, [r4]	/* q0 := key[16:32) */
 	mov	r3, #0		/* q1 = (0, nonce[0:4), ..., nonce[8:12)) */
 	vmov	d2, r3, r6
 	vmov	d3, r8, r10
 	vzip.32	q8, q9
 	vzip.32	q10, q11
 	vzip.32	q12, q13
 	vzip.32	q14, q15
 	vswp	d17, d20
 	vswp	d25, d28
 	vswp	d19, d22
 	vswp	d25, d28
 	vswp	d17, d20
 	vswp	d27, d30
 	vswp	q9, q12		/* free up q9 earlier for consecutive q8-q9 */
 	vswp	q11, q14
 	vadd.u32 q8, q8, q0
 	vadd.u32 q12, q12, q0
 	vadd.u32 q10, q10, q0
-	vadd.u32 q11, q11, q0
+	vadd.u32 q14, q14, q0
 	vadd.u32 q9, q9, q1
 	vadd.u32 q13, q13, q1
-	vadd.u32 q14, q14, q1
+	vadd.u32 q11, q11, q1
 	vadd.u32 q15, q15, q1
 	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [32:64) */
 	LE32TOH(q8)
 	LE32TOH(q9)
 	LE32TOH(q10)
-	LE32TOH(q14)
+	LE32TOH(q11)
 	LE32TOH(q12)
 	LE32TOH(q13)
-	LE32TOH(q11)
+	LE32TOH(q14)
 	LE32TOH(q15)
 	veor	q0, q0, q8	/* compute ciphertext bytes [32:64) */
 	veor	q1, q1, q9
 	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [64:96) */
 	vst1.32	{q0-q1}, [r0]!	/* store ciphertext bytes [32:64) */
 	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [96:128) */
 	veor	q2, q2, q8	/* compute ciphertext bytes [64:96) */
-	veor	q6, q6, q9
+	veor	q3, q3, q9
 	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [128:160) */
-	vst1.32	{q2}, [r0]!	/* store ciphertext bytes [64:80) */
+	vst1.32	{q2-q3}, [r0]!	/* store ciphertext bytes [64:96) */
 	veor	q10, q10, q0	/* compute ciphertext bytes [96:128) */
-	veor	q14, q14, q1
+	veor	q11, q11, q1
 	vld1.32	{q0-q1}, [r1]!	/* load plaintext bytes [160:192) */
-	vst1.32	{q6}, [r0]!	/* store ciphertext bytes [80:96) */
+	vst1.32	{q10-q11}, [r0]!	/* store ciphertext bytes [96:128) */
 	veor	q4, q4, q8	/* compute ciphertext bytes [128:160) */
 	veor	q5, q5, q9
 	vld1.32	{q8-q9}, [r1]!	/* load plaintext bytes [192:224) */
-	vst1.32	{q10}, [r0]!	/* store ciphertext bytes [96:112) */
+	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
 	veor	q12, q12, q0	/* compute ciphertext bytes [160:192) */
 	veor	q13, q13, q1
 	vld1.32	{q0-q1}, [r1]	/* load plaintext bytes [224:256) */
-	vst1.32	{q14}, [r0]!	/* store ciphertext bytes [112:128) */
+	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */
-	veor	q8, q3, q8	/* compute ciphertext bytes [192:224) */
+	veor	q6, q6, q8	/* compute ciphertext bytes [192:224) */
-	veor	q9, q7, q9
+	veor	q7, q7, q9
-	vst1.32	{q4-q5}, [r0]!	/* store ciphertext bytes [128:160) */
+	vst1.32	{q6-q7}, [r0]!	/* store ciphertext bytes [192:224) */
 	vst1.32	{q12-q13}, [r0]!	/* store ciphertext bytes [160:192) */
-	veor	q0, q11, q0	/* compute ciphertext bytes [224:256) */
+	veor	q14, q14, q0	/* compute ciphertext bytes [224:256) */
-	veor	q1, q15, q1
+	veor	q15, q15, q1
-	vst1.32	{q8-q9}, [r0]!	/* store ciphertext bytes [192:224) */
+	vst1.32	{q14-q15}, [r0]	/* store ciphertext bytes [224:256) */
 	vst1.32	{q0-q1}, [r0]	/* store ciphertext bytes [224:256) */
 	/* zero temporary space on the stack */
 	vmov.i32 q0, #0
 	vmov.i32 q1, #0
 	vst1.8	{q0-q1}, [fp, :256]
 	/* restore callee-saves registers and stack */
 	vpop	{d8-d15}
 	pop	{r4, r5, r6, r7, r8, r10, fp, lr}
 	bx	lr
 END(chacha_stream_xor256_neon)
 	.section .rodata
 	.p2align 4
 /*
  * Constant tables.  The code above loads v0123 through r7 with a
  * 128-bit-aligned post-incrementing load, which leaves r7 pointing
  * at rot8, so the two tables must stay adjacent and in this order.
  */
 .Lconstants:
 	.type	v0123,%object
 v0123:
 	/* added to the replicated blkno to give blkno+0..blkno+3 */
 	.long	0, 1, 2, 3
 END(v0123)
 	.type	rot8,%object
 rot8:
 	/* vtbl.8 index vector used by ROUND for the <<< 8 rotation */
 	.long	0x02010003, 0x06050407
 END(rot8)