Tue Jul 28 20:11:09 2020 UTC
Draft 2x vectorized neon vpaes for aarch64.

Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4, whose CPU also
lacks ARMv8.0-AES.


(riastradh)
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c
diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c
diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h
diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h
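
For context on why the 2x path pays off: in CBC decryption each ciphertext
block feeds the AES core independently, and only the final XOR with the
preceding ciphertext block is chained, so adjacent blocks can share one
vectorized call.  A minimal sketch of the idea, assuming the loadblock/
storeblock helpers from aes_neon_subr.c and the aes_neon_dec2 entry point
added in the diffs below (tail and IV bookkeeping omitted):

/*
 * Sketch only: decrypt adjacent ciphertext blocks C[i] and C[i+1] in
 * one 2x call.  The two AES inversions are independent; only the XORs
 * with the preceding ciphertext blocks are chained.
 */
static void
cbc_dec2_sketch(const struct aesdec *dec, const uint8_t in[static 32],
    uint8_t out[static 32], uint8x16_t cprev /* C[i-1], or the IV */,
    uint32_t nrounds)
{
	uint8x16x2_t b2;

	b2.val[0] = loadblock(in);	/* C[i] */
	b2.val[1] = loadblock(in + 16);	/* C[i+1] */
	b2 = aes_neon_dec2(dec, b2, nrounds);
	storeblock(out, b2.val[0] ^ cprev);			/* P[i] */
	storeblock(out + 16, b2.val[1] ^ loadblock(in));	/* P[i+1] */
}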

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c

--- src/sys/crypto/aes/arch/arm/aes_neon.c 2020/06/30 20:32:11 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ 1/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -29,27 +29,27 @@ @@ -29,27 +29,27 @@
29/* 29/*
30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES 30 * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
31 * software, at <https://crypto.stanford.edu/vpaes/>, described in 31 * software, at <https://crypto.stanford.edu/vpaes/>, described in
32 * 32 *
33 * Mike Hamburg, `Accelerating AES with Vector Permute 33 * Mike Hamburg, `Accelerating AES with Vector Permute
34 * Instructions', in Christophe Clavier and Kris Gaj (eds.), 34 * Instructions', in Christophe Clavier and Kris Gaj (eds.),
35 * Cryptographic Hardware and Embedded Systems -- CHES 2009, 35 * Cryptographic Hardware and Embedded Systems -- CHES 2009,
36 * Springer LNCS 5747, pp. 18-32. 36 * Springer LNCS 5747, pp. 18-32.
37 * 37 *
38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 38 * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39 */ 39 */
40 40
41#include <sys/cdefs.h> 41#include <sys/cdefs.h>
 42__KERNEL_RCSID(0, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); 42__KERNEL_RCSID(0, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
43 43
44#include <sys/types.h> 44#include <sys/types.h>
45 45
46#ifdef _KERNEL 46#ifdef _KERNEL
47#include <sys/systm.h> 47#include <sys/systm.h>
48#else 48#else
49#include <err.h> 49#include <err.h>
50#define panic(fmt, args...) err(1, fmt, ##args) 50#define panic(fmt, args...) err(1, fmt, ##args)
51#endif 51#endif
52 52
53#include "aes_neon_impl.h" 53#include "aes_neon_impl.h"
54 54
55#ifdef __aarch64__ 55#ifdef __aarch64__
@@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc,  @@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc,
579 579
580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); 580 A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
581 A ^= loadroundkey(rk32); 581 A ^= loadroundkey(rk32);
582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); 582 A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); 583 A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); 584 A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); 585 x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
586 } 586 }
587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); 587 x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
588 x ^= loadroundkey(rk32); 588 x ^= loadroundkey(rk32);
589 return vqtbl1q_u8(x, sr[rmod4]); 589 return vqtbl1q_u8(x, sr[rmod4]);
590} 590}
591 591
 592uint8x16x2_t
 593aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
 594{
 595 const uint32_t *rk32 = enc->aese_aes.aes_rk;
 596 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 597 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 598 uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
 599 uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
 600 uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
 601 uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
 602 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 603 uint8x16_t io0, jo0, io1, jo1;
 604 unsigned rmod4 = 0;
 605
 606 x0 = aes_schedule_transform(x0, ipt);
 607 x1 = aes_schedule_transform(x1, ipt);
 608 x0 ^= loadroundkey(rk32);
 609 x1 ^= loadroundkey(rk32);
 610 for (;;) {
 611 uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
 612 uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
 613
 614 subbytes(&io0, &jo0, x0, inv_, inva_);
 615 subbytes(&io1, &jo1, x1, inv_, inva_);
 616
 617 rk32 += 4;
 618 rmod4 = (rmod4 + 1) % 4;
 619 if (--nrounds == 0)
 620 break;
 621
 622 A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
 623 A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
 624 A_0 ^= loadroundkey(rk32);
 625 A_1 ^= loadroundkey(rk32);
 626 A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
 627 A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
 628 A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
 629 A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
 630 A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
 631 A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
 632 x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
 633 x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
 634 }
 635 x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
 636 x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
 637 x0 ^= loadroundkey(rk32);
 638 x1 ^= loadroundkey(rk32);
 639 return (uint8x16x2_t) { .val = {
 640 [0] = vqtbl1q_u8(x0, sr[rmod4]),
 641 [1] = vqtbl1q_u8(x1, sr[rmod4]),
 642 } };
 643}
 644
592uint8x16_t 645uint8x16_t
593aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) 646aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
594{ 647{
595 const uint32_t *rk32 = dec->aesd_aes.aes_rk; 648 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
596 unsigned i = 3 & ~(nrounds - 1); 649 unsigned i = 3 & ~(nrounds - 1);
597 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; 650 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
598 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; 651 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
599 uint8x16_t io, jo, mc; 652 uint8x16_t io, jo, mc;
600 653
601 x = aes_schedule_transform(x, dipt); 654 x = aes_schedule_transform(x, dipt);
602 x ^= loadroundkey(rk32); 655 x ^= loadroundkey(rk32);
603 rk32 += 4; 656 rk32 += 4;
604 657
@@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec,  @@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec,
618 x = vqtbl1q_u8(x, mc); 671 x = vqtbl1q_u8(x, mc);
619 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); 672 x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);
620 673
621 x = vqtbl1q_u8(x, mc); 674 x = vqtbl1q_u8(x, mc);
622 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); 675 x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);
623 676
624 mc = vextq_u8(mc, mc, 12); 677 mc = vextq_u8(mc, mc, 12);
625 } 678 }
626 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); 679 x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
627 x ^= loadroundkey(rk32); 680 x ^= loadroundkey(rk32);
628 return vqtbl1q_u8(x, sr[i]); 681 return vqtbl1q_u8(x, sr[i]);
629} 682}
630 683
 684uint8x16x2_t
 685aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
 686{
 687 const uint32_t *rk32 = dec->aesd_aes.aes_rk;
 688 unsigned i = 3 & ~(nrounds - 1);
 689 uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
 690 uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
 691 uint8x16_t x0 = x.val[0], x1 = x.val[1];
 692 uint8x16_t io0, jo0, io1, jo1, mc;
 693
 694 x0 = aes_schedule_transform(x0, dipt);
 695 x1 = aes_schedule_transform(x1, dipt);
 696 x0 ^= loadroundkey(rk32);
 697 x1 ^= loadroundkey(rk32);
 698 rk32 += 4;
 699
 700 mc = mc_forward[3];
 701 for (;;) {
 702 subbytes(&io0, &jo0, x0, inv_, inva_);
 703 subbytes(&io1, &jo1, x1, inv_, inva_);
 704 if (--nrounds == 0)
 705 break;
 706
 707 x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
 708 x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
 709 x0 ^= loadroundkey(rk32);
 710 x1 ^= loadroundkey(rk32);
 711 rk32 += 4; /* next round key */
 712
 713 x0 = vqtbl1q_u8(x0, mc);
 714 x1 = vqtbl1q_u8(x1, mc);
 715 x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
 716 x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
 717
 718 x0 = vqtbl1q_u8(x0, mc);
 719 x1 = vqtbl1q_u8(x1, mc);
 720 x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
 721 x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
 722
 723 x0 = vqtbl1q_u8(x0, mc);
 724 x1 = vqtbl1q_u8(x1, mc);
 725 x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
 726 x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
 727
 728 mc = vextq_u8(mc, mc, 12);
 729 }
 730 x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
 731 x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
 732 x0 ^= loadroundkey(rk32);
 733 x1 ^= loadroundkey(rk32);
 734 return (uint8x16x2_t) { .val = {
 735 [0] = vqtbl1q_u8(x0, sr[i]),
 736 [1] = vqtbl1q_u8(x1, sr[i]),
 737 } };
 738}
 739
631#endif 740#endif
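
Sanity check for the interleaved round loops above: aes_neon_enc2 must agree
bit for bit with two independent aes_neon_enc1 calls on the same blocks.  A
sketch of such a check, not part of the commit, assuming vst1q_u8 (already
relied on by the loadblock/storeblock helpers) and memcmp are available:

/*
 * Sketch of a consistency check: encrypt two blocks through the 2x
 * path and through two 1x calls, and compare.  Returns 0 on match.
 */
static int
enc2_matches_enc1(const struct aesenc *enc, uint8x16_t a, uint8x16_t b,
    unsigned nrounds)
{
	uint8x16x2_t b2 = { .val = { a, b } };
	uint8_t buf1[32], buf2[32];

	b2 = aes_neon_enc2(enc, b2, nrounds);
	vst1q_u8(buf2, b2.val[0]);
	vst1q_u8(buf2 + 16, b2.val[1]);
	vst1q_u8(buf1, aes_neon_enc1(enc, a, nrounds));
	vst1q_u8(buf1 + 16, aes_neon_enc1(enc, b, nrounds));
	return memcmp(buf1, buf2, 32);
}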

cvs diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon_subr.c

--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ 1/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -17,27 +17,27 @@ @@ -17,27 +17,27 @@
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE. 26 * POSSIBILITY OF SUCH DAMAGE.
27 */ 27 */
28 28
29#include <sys/cdefs.h> 29#include <sys/cdefs.h>
 30__KERNEL_RCSID(0, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); 30__KERNEL_RCSID(0, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
31 31
32#include <sys/endian.h> 32#include <sys/endian.h>
33 33
34#ifdef _KERNEL 34#ifdef _KERNEL
35#include <sys/systm.h> 35#include <sys/systm.h>
36#include <lib/libkern/libkern.h> 36#include <lib/libkern/libkern.h>
37#else 37#else
38#include <assert.h> 38#include <assert.h>
39#include <inttypes.h> 39#include <inttypes.h>
40#include <stdio.h> 40#include <stdio.h>
41#define KASSERT assert 41#define KASSERT assert
42#endif 42#endif
43 43
@@ -101,34 +101,53 @@ void @@ -101,34 +101,53 @@ void
101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], 101aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], 102 uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
103 uint32_t nrounds) 103 uint32_t nrounds)
104{ 104{
105 uint8x16_t iv0, cv, b; 105 uint8x16_t iv0, cv, b;
106 106
107 KASSERT(nbytes); 107 KASSERT(nbytes);
108 KASSERT(nbytes % 16 == 0); 108 KASSERT(nbytes % 16 == 0);
109 109
110 iv0 = loadblock(iv); 110 iv0 = loadblock(iv);
111 cv = loadblock(in + nbytes - 16); 111 cv = loadblock(in + nbytes - 16);
112 storeblock(iv, cv); 112 storeblock(iv, cv);
113 113
114 for (;;) { 114 if (nbytes % 32) {
 115 KASSERT(nbytes % 32 == 16);
115 b = aes_neon_dec1(dec, cv, nrounds); 116 b = aes_neon_dec1(dec, cv, nrounds);
116 if ((nbytes -= 16) == 0) 117 if ((nbytes -= 16) == 0)
117 break; 118 goto out;
 119 cv = loadblock(in + nbytes - 16);
 120 storeblock(out + nbytes, cv ^ b);
 121 }
 122
 123 for (;;) {
 124 uint8x16x2_t b2;
 125
 126 KASSERT(nbytes >= 32);
 127
 128 b2.val[1] = cv;
 129 b2.val[0] = cv = loadblock(in + nbytes - 32);
 130 b2 = aes_neon_dec2(dec, b2, nrounds);
 131 storeblock(out + nbytes - 16, cv ^ b2.val[1]);
 132 if ((nbytes -= 32) == 0) {
 133 b = b2.val[0];
 134 goto out;
 135 }
118 cv = loadblock(in + nbytes - 16); 136 cv = loadblock(in + nbytes - 16);
119 storeblock(out + nbytes, b ^ cv); 137 storeblock(out + nbytes, cv ^ b2.val[0]);
120 } 138 }
121 storeblock(out, b ^ iv0); 139
 140out: storeblock(out, b ^ iv0);
122} 141}
123 142
124static inline uint8x16_t 143static inline uint8x16_t
125aes_neon_xts_update(uint8x16_t t8) 144aes_neon_xts_update(uint8x16_t t8)
126{ 145{
127 const int32x4_t zero = vdupq_n_s32(0); 146 const int32x4_t zero = vdupq_n_s32(0);
128 const int32x4_t carry = {0x87, 1, 1, 1}; 147 const int32x4_t carry = {0x87, 1, 1, 1};
129 int32x4_t t, t_; 148 int32x4_t t, t_;
130 uint32x4_t mask; 149 uint32x4_t mask;
131 150
132 t = vreinterpretq_s32_u8(t8); 151 t = vreinterpretq_s32_u8(t8);
133 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ 152 mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */
134 mask = vextq_u32(mask, mask, 3); /* rotate quarters */ 153 mask = vextq_u32(mask, mask, 3); /* rotate quarters */
@@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void) @@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void)
176} 195}
177 196
178void 197void
179aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], 198aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
180 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 199 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
181 uint32_t nrounds) 200 uint32_t nrounds)
182{ 201{
183 uint8x16_t t, b; 202 uint8x16_t t, b;
184 203
185 KASSERT(nbytes); 204 KASSERT(nbytes);
186 KASSERT(nbytes % 16 == 0); 205 KASSERT(nbytes % 16 == 0);
187 206
188 t = loadblock(tweak); 207 t = loadblock(tweak);
189 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 208 if (nbytes % 32) {
 209 KASSERT(nbytes % 32 == 16);
190 b = t ^ loadblock(in); 210 b = t ^ loadblock(in);
191 b = aes_neon_enc1(enc, b, nrounds); 211 b = aes_neon_enc1(enc, b, nrounds);
192 storeblock(out, t ^ b); 212 storeblock(out, t ^ b);
193 t = aes_neon_xts_update(t); 213 t = aes_neon_xts_update(t);
 214 nbytes -= 16;
 215 in += 16;
 216 out += 16;
 217 }
 218 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 219 uint8x16_t t1;
 220 uint8x16x2_t b2;
 221
 222 t1 = aes_neon_xts_update(t);
 223 b2.val[0] = t ^ loadblock(in);
 224 b2.val[1] = t1 ^ loadblock(in + 16);
 225 b2 = aes_neon_enc2(enc, b2, nrounds);
 226 storeblock(out, b2.val[0] ^ t);
 227 storeblock(out + 16, b2.val[1] ^ t1);
 228
 229 t = aes_neon_xts_update(t1);
194 } 230 }
195 storeblock(tweak, t); 231 storeblock(tweak, t);
196} 232}
197 233
198void 234void
199aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], 235aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
200 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], 236 uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
201 uint32_t nrounds) 237 uint32_t nrounds)
202{ 238{
203 uint8x16_t t, b; 239 uint8x16_t t, b;
204 240
205 KASSERT(nbytes); 241 KASSERT(nbytes);
206 KASSERT(nbytes % 16 == 0); 242 KASSERT(nbytes % 16 == 0);
207 243
208 t = loadblock(tweak); 244 t = loadblock(tweak);
209 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 245 if (nbytes % 32) {
 246 KASSERT(nbytes % 32 == 16);
210 b = t ^ loadblock(in); 247 b = t ^ loadblock(in);
211 b = aes_neon_dec1(dec, b, nrounds); 248 b = aes_neon_dec1(dec, b, nrounds);
212 storeblock(out, t ^ b); 249 storeblock(out, t ^ b);
213 t = aes_neon_xts_update(t); 250 t = aes_neon_xts_update(t);
 251 nbytes -= 16;
 252 in += 16;
 253 out += 16;
 254 }
 255 for (; nbytes; nbytes -= 32, in += 32, out += 32) {
 256 uint8x16_t t1;
 257 uint8x16x2_t b2;
 258
 259 t1 = aes_neon_xts_update(t);
 260 b2.val[0] = t ^ loadblock(in);
 261 b2.val[1] = t1 ^ loadblock(in + 16);
 262 b2 = aes_neon_dec2(dec, b2, nrounds);
 263 storeblock(out, b2.val[0] ^ t);
 264 storeblock(out + 16, b2.val[1] ^ t1);
 265
 266 t = aes_neon_xts_update(t1);
214 } 267 }
215 storeblock(tweak, t); 268 storeblock(tweak, t);
216} 269}
217 270
218void 271void
219aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], 272aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
220 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) 273 size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
221{ 274{
222 uint8x16_t auth; 275 uint8x16_t auth;
223 276
224 KASSERT(nbytes); 277 KASSERT(nbytes);
225 KASSERT(nbytes % 16 == 0); 278 KASSERT(nbytes % 16 == 0);
226 279
@@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e @@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e
252 uint32_t nrounds) 305 uint32_t nrounds)
253{ 306{
254 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 307 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
255 uint8x16_t auth, ptxt, ctr_be; 308 uint8x16_t auth, ptxt, ctr_be;
256 uint32x4_t ctr; 309 uint32x4_t ctr;
257 310
258 KASSERT(nbytes); 311 KASSERT(nbytes);
259 KASSERT(nbytes % 16 == 0); 312 KASSERT(nbytes % 16 == 0);
260 313
261 auth = loadblock(authctr); 314 auth = loadblock(authctr);
262 ctr_be = loadblock(authctr + 16); 315 ctr_be = loadblock(authctr + 16);
263 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 316 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
264 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 317 for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 318 uint8x16x2_t b2;
265 ptxt = loadblock(in); 319 ptxt = loadblock(in);
266 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 
267 ctr = vaddq_u32(ctr, ctr32_inc); 320 ctr = vaddq_u32(ctr, ctr32_inc);
268 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 321 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
269 storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); 322
 323 b2.val[0] = auth ^ ptxt;
 324 b2.val[1] = ctr_be;
 325 b2 = aes_neon_enc2(enc, b2, nrounds);
 326 auth = b2.val[0];
 327 storeblock(out, ptxt ^ b2.val[1]);
270 } 328 }
271 storeblock(authctr, auth); 329 storeblock(authctr, auth);
272 storeblock(authctr + 16, ctr_be); 330 storeblock(authctr + 16, ctr_be);
273} 331}
274 332
275void 333void
276aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], 334aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
277 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], 335 uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
278 uint32_t nrounds) 336 uint32_t nrounds)
279{ 337{
280 const uint32x4_t ctr32_inc = {0, 0, 0, 1}; 338 const uint32x4_t ctr32_inc = {0, 0, 0, 1};
281 uint8x16_t auth, ctr_be, ptxt; 339 uint8x16_t auth, ctr_be, ptxt, pad;
282 uint32x4_t ctr; 340 uint32x4_t ctr;
283 341
284 KASSERT(nbytes); 342 KASSERT(nbytes);
285 KASSERT(nbytes % 16 == 0); 343 KASSERT(nbytes % 16 == 0);
286 344
287 auth = loadblock(authctr); 
288 ctr_be = loadblock(authctr + 16); 345 ctr_be = loadblock(authctr + 16);
289 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); 346 ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be));
290 for (; nbytes; nbytes -= 16, in += 16, out += 16) { 347 ctr = vaddq_u32(ctr, ctr32_inc);
 348 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
 349 pad = aes_neon_enc1(enc, ctr_be, nrounds);
 350 auth = loadblock(authctr);
 351 for (;; in += 16, out += 16) {
 352 uint8x16x2_t b2;
 353
 354 ptxt = loadblock(in) ^ pad;
 355 auth ^= ptxt;
 356 storeblock(out, ptxt);
 357
 358 if ((nbytes -= 16) == 0)
 359 break;
 360
291 ctr = vaddq_u32(ctr, ctr32_inc); 361 ctr = vaddq_u32(ctr, ctr32_inc);
292 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); 362 ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr));
293 ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); 363 b2.val[0] = auth;
294 storeblock(out, ptxt); 364 b2.val[1] = ctr_be;
295 auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); 365 b2 = aes_neon_enc2(enc, b2, nrounds);
 366 auth = b2.val[0];
 367 pad = b2.val[1];
296 } 368 }
 369 auth = aes_neon_enc1(enc, auth, nrounds);
297 storeblock(authctr, auth); 370 storeblock(authctr, auth);
298 storeblock(authctr + 16, ctr_be); 371 storeblock(authctr + 16, ctr_be);
299} 372}
300 373
301int 374int
302aes_neon_selftest(void) 375aes_neon_selftest(void)
303{ 376{
304 377
305 if (aes_neon_xts_update_selftest()) 378 if (aes_neon_xts_update_selftest())
306 return -1; 379 return -1;
307 380
308 return 0; 381 return 0;
309} 382}
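
The XTS loops above advance the tweak with aes_neon_xts_update, which
multiplies the 128-bit tweak by x in GF(2^128) with the usual XTS reduction
polynomial.  A scalar reference sketch of the same update (little-endian
byte order), for comparison with the NEON version:

/*
 * Scalar sketch of the XTS tweak update: treat the tweak as a
 * little-endian polynomial over GF(2), multiply by x, and reduce
 * modulo x^128 + x^7 + x^2 + x + 1.
 */
static void
xts_update_scalar(uint8_t t[static 16])
{
	unsigned carry = 0, msb, i;

	for (i = 0; i < 16; i++) {
		msb = t[i] >> 7;
		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;	/* x^128 = x^7 + x^2 + x + 1 */
}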

cvs diff -r1.1 -r1.2 src/sys/crypto/aes/arch/arm/aes_neon_impl.h

--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ 1/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -29,14 +29,43 @@ @@ -29,14 +29,43 @@
29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 29#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H 30#define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H
31 31
32#include <sys/types.h> 32#include <sys/types.h>
33 33
34#include "arm_neon.h" 34#include "arm_neon.h"
35 35
36#include <crypto/aes/aes.h> 36#include <crypto/aes/aes.h>
37#include <crypto/aes/arch/arm/aes_neon.h> 37#include <crypto/aes/arch/arm/aes_neon.h>
38 38
39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); 39uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); 40uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
41 41
 42#ifdef __aarch64__
 43
 44uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
 45uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
 46
 47#else
 48
 49static inline uint8x16x2_t
 50aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
 51{
 52
 53 return (uint8x16x2_t) { .val = {
 54 [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
 55 [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
 56 } };
 57}
 58
 59static inline uint8x16x2_t
 60aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
 61{
 62
 63 return (uint8x16x2_t) { .val = {
 64 [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
 65 [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
 66 } };
 67}
 68
 69#endif
 70
42#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ 71#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
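
The 32-bit fallbacks above let the mode code in aes_neon_subr.c call
aes_neon_enc2 unconditionally: on armv7 it simply expands to two 1x calls.
One place that pays off is CCM, where each 16-byte step needs one AES call
for the CBC-MAC and one for the CTR keystream; the two are independent, so
they ride through a single 2x call.  A stripped-down sketch of that step
(names follow aes_neon_subr.c; counter increment and byte swapping omitted):

/*
 * Sketch of one CCM encryption step: the CBC-MAC block and the CTR
 * keystream block are independent, so pair them.  Returns the updated
 * CBC-MAC state; *pad receives the keystream block to XOR into the
 * plaintext.
 */
static uint8x16_t
ccm_step_sketch(const struct aesenc *enc, uint8x16_t auth,
    uint8x16_t ctr_be, uint8x16_t ptxt, uint8x16_t *pad, uint32_t nrounds)
{
	uint8x16x2_t b2;

	b2.val[0] = auth ^ ptxt;	/* next CBC-MAC input */
	b2.val[1] = ctr_be;		/* big-endian counter block */
	b2 = aes_neon_enc2(enc, b2, nrounds);
	*pad = b2.val[1];
	return b2.val[0];
}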

cvs diff -r1.6 -r1.7 src/sys/crypto/aes/arch/arm/arm_neon.h

--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ 1/* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -32,38 +32,40 @@ @@ -32,38 +32,40 @@
32#if defined(__GNUC__) && !defined(__clang__) 32#if defined(__GNUC__) && !defined(__clang__)
33 33
34#define _INTRINSATTR \ 34#define _INTRINSATTR \
35 __extension__ \ 35 __extension__ \
36 __attribute__((__always_inline__, __gnu_inline__, __artificial__)) 36 __attribute__((__always_inline__, __gnu_inline__, __artificial__))
37 37
38#ifdef __aarch64__ 38#ifdef __aarch64__
39typedef __Int32x4_t int32x4_t; 39typedef __Int32x4_t int32x4_t;
40typedef __Int64x2_t int64x2_t; 40typedef __Int64x2_t int64x2_t;
41typedef __Int8x16_t int8x16_t; 41typedef __Int8x16_t int8x16_t;
42typedef __Uint32x4_t uint32x4_t; 42typedef __Uint32x4_t uint32x4_t;
43typedef __Uint64x2_t uint64x2_t; 43typedef __Uint64x2_t uint64x2_t;
44typedef __Uint8x16_t uint8x16_t; 44typedef __Uint8x16_t uint8x16_t;
 45typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
45#else 46#else
46typedef __simd128_int32_t int32x4_t; 47typedef __simd128_int32_t int32x4_t;
47typedef __simd128_int64_t int64x2_t; 48typedef __simd128_int64_t int64x2_t;
48typedef __simd128_int8_t int8x16_t; 49typedef __simd128_int8_t int8x16_t;
49typedef __simd128_uint32_t uint32x4_t; 50typedef __simd128_uint32_t uint32x4_t;
50typedef __simd128_uint64_t uint64x2_t; 51typedef __simd128_uint64_t uint64x2_t;
51typedef __simd128_uint8_t uint8x16_t; 52typedef __simd128_uint8_t uint8x16_t;
52 53
53typedef __simd64_int8_t int8x8_t; 54typedef __simd64_int8_t int8x8_t;
54typedef __simd64_uint8_t uint8x8_t; 55typedef __simd64_uint8_t uint8x8_t;
55typedef __builtin_neon_udi uint64x1_t; 56typedef __builtin_neon_udi uint64x1_t;
56typedef struct { uint8x8_t val[2]; } uint8x8x2_t; 57typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
 58typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
57#endif 59#endif
58 60
59#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) 61#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
60#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) 62#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
61#else 63#else
62#define __neon_lane_index(__v, __i) __i 64#define __neon_lane_index(__v, __i) __i
63#endif 65#endif
64 66
65#elif defined(__clang__) 67#elif defined(__clang__)
66 68
67#define _INTRINSATTR \ 69#define _INTRINSATTR \
68 __attribute__((__always_inline__, __nodebug__)) 70 __attribute__((__always_inline__, __nodebug__))
69 71