Tue Jul 28 20:15:07 2020 UTC
Rewrite cprng_fast in terms of new ChaCha API.


(riastradh)
diff -r1.15 -r1.16 src/sys/crypto/cprng_fast/cprng_fast.c
diff -r1.2 -r1.3 src/sys/crypto/cprng_fast/files.cprng_fast
diff -r1.49 -r1.50 src/sys/kern/files.kern
diff -r1.12 -r1.13 src/sys/rump/kern/lib/libcrypto/Makefile
diff -r1.183 -r1.184 src/sys/rump/librump/rumpkern/Makefile.rumpkern

cvs diff -r1.15 -r1.16 src/sys/crypto/cprng_fast/cprng_fast.c (expand / switch to context diff)
--- src/sys/crypto/cprng_fast/cprng_fast.c 2020/04/30 03:29:45 1.15
+++ src/sys/crypto/cprng_fast/cprng_fast.c 2020/07/28 20:15:07 1.16
@@ -1,4 +1,4 @@
-/*	$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $	*/
+/*	$NetBSD: cprng_fast.c,v 1.16 2020/07/28 20:15:07 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.15 2020/04/30 03:29:45 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.16 2020/07/28 20:15:07 riastradh Exp $");
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -42,170 +42,21 @@
 #include <sys/intr.h>
 #include <sys/kmem.h>
 #include <sys/percpu.h>
-
-/* ChaCha core */
 
-#define	crypto_core_OUTPUTWORDS	16
-#define	crypto_core_INPUTWORDS	4
-#define	crypto_core_KEYWORDS	8
-#define	crypto_core_CONSTWORDS	4
-
-#define	crypto_core_ROUNDS	8
-
-static uint32_t
-rotate(uint32_t u, unsigned c)
-{
-
-	return (u << c) | (u >> (32 - c));
-}
-
-#define	QUARTERROUND(a, b, c, d) do {					      \
-	(a) += (b); (d) ^= (a); (d) = rotate((d), 16);			      \
-	(c) += (d); (b) ^= (c); (b) = rotate((b), 12);			      \
-	(a) += (b); (d) ^= (a); (d) = rotate((d),  8);			      \
-	(c) += (d); (b) ^= (c); (b) = rotate((b),  7);			      \
-} while (0)
-
-static void
-crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
-    const uint32_t *c)
-{
-	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
-	int i;
-
-	x0 = c[0];
-	x1 = c[1];
-	x2 = c[2];
-	x3 = c[3];
-	x4 = k[0];
-	x5 = k[1];
-	x6 = k[2];
-	x7 = k[3];
-	x8 = k[4];
-	x9 = k[5];
-	x10 = k[6];
-	x11 = k[7];
-	x12 = in[0];
-	x13 = in[1];
-	x14 = in[2];
-	x15 = in[3];
-
-	for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
-		QUARTERROUND( x0, x4, x8,x12);
-		QUARTERROUND( x1, x5, x9,x13);
-		QUARTERROUND( x2, x6,x10,x14);
-		QUARTERROUND( x3, x7,x11,x15);
-		QUARTERROUND( x0, x5,x10,x15);
-		QUARTERROUND( x1, x6,x11,x12);
-		QUARTERROUND( x2, x7, x8,x13);
-		QUARTERROUND( x3, x4, x9,x14);
-	}
-
-	out[0] = x0 + c[0];
-	out[1] = x1 + c[1];
-	out[2] = x2 + c[2];
-	out[3] = x3 + c[3];
-	out[4] = x4 + k[0];
-	out[5] = x5 + k[1];
-	out[6] = x6 + k[2];
-	out[7] = x7 + k[3];
-	out[8] = x8 + k[4];
-	out[9] = x9 + k[5];
-	out[10] = x10 + k[6];
-	out[11] = x11 + k[7];
-	out[12] = x12 + in[0];
-	out[13] = x13 + in[1];
-	out[14] = x14 + in[2];
-	out[15] = x15 + in[3];
-}
+#include <crypto/chacha/chacha.h>
 
-/* `expand 32-byte k' */
-static const uint32_t crypto_core_constant32[4] = {
-	0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U,
-};
+#define	CPRNG_FAST_SEED_BYTES	CHACHA_STREAM_KEYBYTES
 
-/*
- * Test vector for ChaCha20 from
- * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
- * test vectors for ChaCha12 and ChaCha8 generated by the same
- * crypto_core code with crypto_core_ROUNDS varied.
- */
-
-#define	check(E)	do						\
-{									\
-	if (!(E))							\
-		panic("crypto self-test failed: %s", #E);		\
-} while (0)
-
-static void
-crypto_core_selftest(void)
-{
-	const uint32_t zero32[8] = {0};
-	const uint8_t sigma[] = "expand 32-byte k";
-	uint32_t block[16];
-	unsigned i;
-
-#if crypto_core_ROUNDS == 8
-	static const uint8_t out[64] = {
-		0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
-		0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
-		0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
-		0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
-		0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
-		0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
-		0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
-		0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
-	};
-#elif crypto_core_ROUNDS == 12
-	static const uint8_t out[64] = {
-		0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
-		0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
-		0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
-		0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
-		0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
-		0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
-		0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
-		0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
-	};
-#elif crypto_core_ROUNDS == 20
-	static const uint8_t out[64] = {
-		0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
-		0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
-		0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
-		0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
-		0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
-		0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
-		0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
-		0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
-	};
-#else
-#error crypto_core_ROUNDS must be 8, 12, or 20.
-#endif
-
-	check(crypto_core_constant32[0] == le32dec(&sigma[0]));
-	check(crypto_core_constant32[1] == le32dec(&sigma[4]));
-	check(crypto_core_constant32[2] == le32dec(&sigma[8]));
-	check(crypto_core_constant32[3] == le32dec(&sigma[12]));
-
-	crypto_core(block, zero32, zero32, crypto_core_constant32);
-	for (i = 0; i < 16; i++)
-		check(block[i] == le32dec(&out[i*4]));
-}
-
-#undef check
-
-#define	CPRNG_FAST_SEED_BYTES	(crypto_core_KEYWORDS * sizeof(uint32_t))
-
 struct cprng_fast {
-	uint32_t 	buffer[crypto_core_OUTPUTWORDS];
-	uint32_t 	key[crypto_core_KEYWORDS];
-	uint32_t 	nonce[crypto_core_INPUTWORDS];
+	/* 128-bit vector unit generates 256 bytes at once */
+	uint8_t		buf[256];
+	uint8_t		key[CPRNG_FAST_SEED_BYTES];
+	uint8_t		nonce[CHACHA_STREAM_NONCEBYTES];
+	unsigned	i;
 	struct evcnt	*reseed_evcnt;
 	unsigned	epoch;
 };
 
-__CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
-
 static void	cprng_fast_init_cpu(void *, void *, struct cpu_info *);
 static void	cprng_fast_schedule_reseed(struct cprng_fast *);
 static void	cprng_fast_intr(void *);
@@ -223,7 +74,6 @@
 cprng_fast_init(void)
 {
 
-	crypto_core_selftest();
 	cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
 	    cprng_fast_init_cpu, NULL, NULL);
 	cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
@@ -247,7 +97,7 @@
 	    ci->ci_cpuname, "cprng_fast reseed");
 }
 
-static inline int
+static int
 cprng_fast_get(struct cprng_fast **cprngp)
 {
 	struct cprng_fast *cprng;
@@ -262,7 +112,7 @@
 	return s;
 }
 
-static inline void
+static void
 cprng_fast_put(struct cprng_fast *cprng, int s)
 {
 
@@ -302,149 +152,80 @@
 
 /* CPRNG algorithm */
 
-/*
- * The state consists of a key, the current nonce, and a 64-byte buffer
- * of output.  Since we fill the buffer only when we need output, and
- * eat a 32-bit word at a time, one 32-bit word of the buffer would be
- * wasted.  Instead, we repurpose it to count the number of entries in
- * the buffer remaining, counting from high to low in order to allow
- * comparison to zero to detect when we need to refill it.
- */
-#define	CPRNG_FAST_BUFIDX	(crypto_core_OUTPUTWORDS - 1)
-
 static void
 cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
 {
 
-	(void)memset(cprng->buffer, 0, sizeof cprng->buffer);
+	(void)memset(cprng->buf, 0, sizeof cprng->buf);
 	(void)memcpy(cprng->key, seed, sizeof cprng->key);
 	(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
+	cprng->i = sizeof cprng->buf;
 }
 
-static inline uint32_t
-cprng_fast_word(struct cprng_fast *cprng)
+static void
+cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned len)
 {
-	uint32_t v;
+	uint8_t *p = buf;
+	unsigned n = len, n0;
 
-	if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
-		v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
-	} else {
-		/* If we don't have enough words, refill the buffer.  */
-		crypto_core(cprng->buffer, cprng->nonce, cprng->key,
-		    crypto_core_constant32);
-		if (__predict_false(++cprng->nonce[0] == 0)) {
-			cprng->nonce[1]++;
-			cprng_fast_schedule_reseed(cprng);
-		}
-		v = cprng->buffer[CPRNG_FAST_BUFIDX];
-		cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
-	}
+	KASSERT(cprng->i <= sizeof(cprng->buf));
+	KASSERT(len <= sizeof(cprng->buf));
 
-	return v;
+	n0 = MIN(n, sizeof(cprng->buf) - cprng->i);
+	memcpy(p, &cprng->buf[cprng->i], n0);
+	if ((n -= n0) == 0) {
+		cprng->i += n0;
+		KASSERT(cprng->i <= sizeof(cprng->buf));
+		return;
+	}
+	p += n0;
+	le64enc(cprng->nonce, 1 + le64dec(cprng->nonce));
+	chacha_stream(cprng->buf, sizeof(cprng->buf), 0, cprng->nonce,
+	    cprng->key, 8);
+	memcpy(p, cprng->buf, n);
+	cprng->i = n;
 }
+
+/* Public API */
 
-static inline void
-cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
+static void
+cprng_fast_buf_short(void *buf, size_t len)
 {
-	uint8_t *p = buf;
-	uint32_t v;
-	unsigned w, r;
+	struct cprng_fast *cprng;
+	int s;
 
-	w = n / sizeof(uint32_t);
-	while (w--) {
-		v = cprng_fast_word(cprng);
-		(void)memcpy(p, &v, 4);
-		p += 4;
-	}
+	KASSERT(len <= sizeof(cprng->buf));
 
-	r = n % sizeof(uint32_t);
-	if (r) {
-		v = cprng_fast_word(cprng);
-		while (r--) {
-			*p++ = (v & 0xff);
-			v >>= 8;
-		}
-	}
+	s = cprng_fast_get(&cprng);
+	cprng_fast_buf(cprng, buf, len);
+	cprng_fast_put(cprng, s);
 }
-
-/*
- * crypto_onetimestream: Expand a short unpredictable one-time seed
- * into a long unpredictable output.
- */
+
 static void
-crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
-    size_t n)
+cprng_fast_buf_long(void *buf, size_t len)
 {
-	uint32_t block[crypto_core_OUTPUTWORDS];
-	uint32_t nonce[crypto_core_INPUTWORDS] = {0};
-	uint8_t *p8;
-	uint32_t *p32;
-	size_t ni, nb, nf;
+	uint8_t seed[CHACHA_STREAM_KEYBYTES];
+	uint8_t nonce[CHACHA_STREAM_NONCEBYTES] = {0};
 
-	/*
-	 * Guarantee we can generate up to n bytes.  We have
-	 * 2^(32*INPUTWORDS) possible inputs yielding output of
-	 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes.  It suffices to
-	 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than
-	 * (1/CHAR_BIT) log_2 of the total output stream length.  We
-	 * have
-	 *
-	 *	log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
-	 *	  = 2 + log_2 o + 32 i.
-	 */
-	__CTASSERT(CHAR_BIT*sizeof n <=
-	    (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
+	CTASSERT(sizeof(seed) <= sizeof(((struct cprng_fast *)0)->buf));
 
-	p8 = buf;
-	p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
-	ni = (uint8_t *)p32 - p8;
-	if (n < ni)
-		ni = n;
-	nb = (n - ni) / sizeof block;
-	nf = (n - ni) % sizeof block;
+#if SIZE_MAX >= 0x3fffffffff
+	/* >=256 GB is not reasonable */
+	KASSERT(len <= 0x3fffffffff);
+#endif
 
-	KASSERT(((uintptr_t)p32 & 3) == 0);
-	KASSERT(ni <= n);
-	KASSERT(nb <= (n / sizeof block));
-	KASSERT(nf <= n);
-	KASSERT(n == (ni + (nb * sizeof block) + nf));
-	KASSERT(ni < sizeof(uint32_t));
-	KASSERT(nf < sizeof block);
+	cprng_fast_buf_short(seed, sizeof seed);
+	chacha_stream(buf, len, 0, nonce, seed, 8);
 
-	if (ni) {
-		crypto_core(block, nonce, seed, crypto_core_constant32);
-		nonce[0]++;
-		(void)memcpy(p8, block, ni);
-	}
-	while (nb--) {
-		crypto_core(p32, nonce, seed, crypto_core_constant32);
-		if (++nonce[0] == 0)
-			nonce[1]++;
-		p32 += crypto_core_OUTPUTWORDS;
-	}
-	if (nf) {
-		crypto_core(block, nonce, seed, crypto_core_constant32);
-		if (++nonce[0] == 0)
-			nonce[1]++;
-		(void)memcpy(p32, block, nf);
-	}
-
-	if (ni | nf)
-		(void)explicit_memset(block, 0, sizeof block);
+	(void)explicit_memset(seed, 0, sizeof seed);
 }
-
-/* Public API */
 
 uint32_t
 cprng_fast32(void)
 {
-	struct cprng_fast *cprng;
 	uint32_t v;
-	int s;
 
-	s = cprng_fast_get(&cprng);
-	v = cprng_fast_word(cprng);
-	cprng_fast_put(cprng, s);
+	cprng_fast_buf_short(&v, sizeof v);
 
 	return v;
 }
@@ -452,45 +233,13 @@
 uint64_t
 cprng_fast64(void)
 {
-	struct cprng_fast *cprng;
-	uint32_t hi, lo;
-	int s;
+	uint64_t v;
 
-	s = cprng_fast_get(&cprng);
-	hi = cprng_fast_word(cprng);
-	lo = cprng_fast_word(cprng);
-	cprng_fast_put(cprng, s);
+	cprng_fast_buf_short(&v, sizeof v);
 
-	return ((uint64_t)hi << 32) | lo;
+	return v;
 }
 
-static void
-cprng_fast_buf_short(void *buf, size_t len)
-{
-	struct cprng_fast *cprng;
-	int s;
-
-	s = cprng_fast_get(&cprng);
-	cprng_fast_buf(cprng, buf, len);
-	cprng_fast_put(cprng, s);
-}
-
-static __noinline void
-cprng_fast_buf_long(void *buf, size_t len)
-{
-	uint32_t seed[crypto_core_KEYWORDS];
-	struct cprng_fast *cprng;
-	int s;
-
-	s = cprng_fast_get(&cprng);
-	cprng_fast_buf(cprng, seed, sizeof seed);
-	cprng_fast_put(cprng, s);
-
-	crypto_onetimestream(seed, buf, len);
-
-	(void)explicit_memset(seed, 0, sizeof seed);
-}
-
 size_t
 cprng_fast(void *buf, size_t len)
 {
@@ -498,12 +247,12 @@
 	/*
 	 * We don't want to hog the CPU, so we use the short version,
 	 * to generate output without preemption, only if we can do it
-	 * with at most one crypto_core.
+	 * with at most one ChaCha call.
 	 */
-	if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
+	if (len <= sizeof(((struct cprng_fast *)0)->buf))
 		cprng_fast_buf_short(buf, len);
 	else
 		cprng_fast_buf_long(buf, len);
 
-	return len;
+	return len;		/* hysterical raisins */
 }

cvs diff -r1.2 -r1.3 src/sys/crypto/cprng_fast/files.cprng_fast (expand / switch to context diff)
--- src/sys/crypto/cprng_fast/files.cprng_fast 2014/08/10 16:44:35 1.2
+++ src/sys/crypto/cprng_fast/files.cprng_fast 2020/07/28 20:15:07 1.3
@@ -1,3 +1,5 @@
-#	$NetBSD: files.cprng_fast,v 1.2 2014/08/10 16:44:35 tls Exp $
+#	$NetBSD: files.cprng_fast,v 1.3 2020/07/28 20:15:07 riastradh Exp $
 
-file	crypto/cprng_fast/cprng_fast.c
+define	cprng_fast: chacha
+
+file	crypto/cprng_fast/cprng_fast.c	cprng_fast

cvs diff -r1.49 -r1.50 src/sys/kern/files.kern (expand / switch to context diff)
--- src/sys/kern/files.kern 2020/06/07 09:45:19 1.49
+++ src/sys/kern/files.kern 2020/07/28 20:15:07 1.50
@@ -1,9 +1,9 @@
-#	$NetBSD: files.kern,v 1.49 2020/06/07 09:45:19 maxv Exp $
+#	$NetBSD: files.kern,v 1.50 2020/07/28 20:15:07 riastradh Exp $
 
 #
 # kernel sources
 #
-define	kern:	machdep, uvm
+define	kern:	cprng_fast, machdep, uvm
 defflag	opt_kern.h			KERN
 defflag	opt_script.h			SETUIDSCRIPTS FDSCRIPTS
 defflag					KASLR

cvs diff -r1.12 -r1.13 src/sys/rump/kern/lib/libcrypto/Makefile (expand / switch to context diff)
--- src/sys/rump/kern/lib/libcrypto/Makefile 2020/07/26 04:25:49 1.12
+++ src/sys/rump/kern/lib/libcrypto/Makefile 2020/07/28 20:15:07 1.13
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.12 2020/07/26 04:25:49 riastradh Exp $
+#	$NetBSD: Makefile,v 1.13 2020/07/28 20:15:07 riastradh Exp $
 #
 
 .PATH:	${.CURDIR}/../../../../crypto/adiantum				\
@@ -6,7 +6,6 @@
 	${.CURDIR}/../../../../crypto/blowfish				\
 	${.CURDIR}/../../../../crypto/camellia				\
 	${.CURDIR}/../../../../crypto/cast128				\
-	${.CURDIR}/../../../../crypto/chacha				\
 	${.CURDIR}/../../../../crypto/des				\
 	${.CURDIR}/../../../../crypto/skipjack
 
@@ -35,11 +34,6 @@
 
 # cast128
 SRCS+=	cast128.c
-
-# ChaCha
-SRCS+=	chacha_impl.c
-SRCS+=	chacha_ref.c
-SRCS+=	chacha_selftest.c
 
 # DES
 SRCS+=	des_ecb.c des_setkey.c des_enc.c des_cbc.c des_module.c

cvs diff -r1.183 -r1.184 src/sys/rump/librump/rumpkern/Makefile.rumpkern (expand / switch to context diff)
--- src/sys/rump/librump/rumpkern/Makefile.rumpkern 2020/04/30 03:28:19 1.183
+++ src/sys/rump/librump/rumpkern/Makefile.rumpkern 2020/07/28 20:15:07 1.184
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile.rumpkern,v 1.183 2020/04/30 03:28:19 riastradh Exp $
+#	$NetBSD: Makefile.rumpkern,v 1.184 2020/07/28 20:15:07 riastradh Exp $
 #
 
 IOCONFDIR:=	${.PARSEDIR}
@@ -17,6 +17,7 @@
 	${RUMPTOP}/../dev					\
 	${RUMPTOP}/../crypto/nist_hash_drbg			\
 	${RUMPTOP}/../crypto/cprng_fast				\
+	${RUMPTOP}/../crypto/chacha				\
 	${RUMPTOP}/../secmodel					\
 	${RUMPTOP}/../secmodel/suser				\
 	${RUMPTOP}/../compat/common
@@ -156,6 +157,9 @@
 # are available from the rumpkern_crypto component
 SRCS+=	nist_hash_drbg.c
 SRCS+=	cprng_fast.c
+SRCS+=	chacha_impl.c
+SRCS+=	chacha_ref.c
+SRCS+=	chacha_selftest.c
 
 .include "${RUMPTOP}/Makefile.rump"
 .include <bsd.own.mk>