Draft 2x vectorized neon vpaes for aarch64. Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in cgd tests, for parallelizable operations like CBC decryption; the same improvement should probably carry over to the rpi4 CPU, which lacks ARMv8.0-AES.

diff -r1.3 -r1.4 src/sys/crypto/aes/arch/arm/aes_neon.c
(riastradh)
--- src/sys/crypto/aes/arch/arm/aes_neon.c 2020/06/30 20:32:11 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ | @@ -1,14 +1,14 @@ | |||
1 | /* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */ | 1 | /* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
@@ -29,27 +29,27 @@ | @@ -29,27 +29,27 @@ | |||
29 | /* | 29 | /* | |
30 | * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES | 30 | * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES | |
31 | * software, at <https://crypto.stanford.edu/vpaes/>, described in | 31 | * software, at <https://crypto.stanford.edu/vpaes/>, described in | |
32 | * | 32 | * | |
33 | * Mike Hamburg, `Accelerating AES with Vector Permute | 33 | * Mike Hamburg, `Accelerating AES with Vector Permute | |
34 | * Instructions', in Christophe Clavier and Kris Gaj (eds.), | 34 | * Instructions', in Christophe Clavier and Kris Gaj (eds.), | |
35 | * Cryptographic Hardware and Embedded Systems -- CHES 2009, | 35 | * Cryptographic Hardware and Embedded Systems -- CHES 2009, | |
36 | * Springer LNCS 5747, pp. 18-32. | 36 | * Springer LNCS 5747, pp. 18-32. | |
37 | * | 37 | * | |
38 | * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 | 38 | * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 | |
39 | */ | 39 | */ | |
40 | 40 | |||
41 | #include <sys/cdefs.h> | 41 | #include <sys/cdefs.h> | |
42 | __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $"); | 42 | __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); | |
43 | 43 | |||
44 | #include <sys/types.h> | 44 | #include <sys/types.h> | |
45 | 45 | |||
46 | #ifdef _KERNEL | 46 | #ifdef _KERNEL | |
47 | #include <sys/systm.h> | 47 | #include <sys/systm.h> | |
48 | #else | 48 | #else | |
49 | #include <err.h> | 49 | #include <err.h> | |
50 | #define panic(fmt, args...) err(1, fmt, ##args) | 50 | #define panic(fmt, args...) err(1, fmt, ##args) | |
51 | #endif | 51 | #endif | |
52 | 52 | |||
53 | #include "aes_neon_impl.h" | 53 | #include "aes_neon_impl.h" | |
54 | 54 | |||
55 | #ifdef __aarch64__ | 55 | #ifdef __aarch64__ | |
@@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc, | @@ -579,26 +579,79 @@ aes_neon_enc1(const struct aesenc *enc, | |||
579 | 579 | |||
580 | A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); | 580 | A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo); | |
581 | A ^= loadroundkey(rk32); | 581 | A ^= loadroundkey(rk32); | |
582 | A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); | 582 | A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo); | |
583 | A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); | 583 | A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); | |
584 | A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); | 584 | A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); | |
585 | x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); | 585 | x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); | |
586 | } | 586 | } | |
587 | x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); | 587 | x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); | |
588 | x ^= loadroundkey(rk32); | 588 | x ^= loadroundkey(rk32); | |
589 | return vqtbl1q_u8(x, sr[rmod4]); | 589 | return vqtbl1q_u8(x, sr[rmod4]); | |
590 | } | 590 | } | |
591 | 591 | |||
592 | uint8x16x2_t | |||
593 | aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds) | |||
594 | { | |||
595 | const uint32_t *rk32 = enc->aese_aes.aes_rk; | |||
596 | uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; | |||
597 | uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; | |||
598 | uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0]; | |||
599 | uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1]; | |||
600 | uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0]; | |||
601 | uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1]; | |||
602 | uint8x16_t x0 = x.val[0], x1 = x.val[1]; | |||
603 | uint8x16_t io0, jo0, io1, jo1; | |||
604 | unsigned rmod4 = 0; | |||
605 | ||||
606 | x0 = aes_schedule_transform(x0, ipt); | |||
607 | x1 = aes_schedule_transform(x1, ipt); | |||
608 | x0 ^= loadroundkey(rk32); | |||
609 | x1 ^= loadroundkey(rk32); | |||
610 | for (;;) { | |||
611 | uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0; | |||
612 | uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1; | |||
613 | ||||
614 | subbytes(&io0, &jo0, x0, inv_, inva_); | |||
615 | subbytes(&io1, &jo1, x1, inv_, inva_); | |||
616 | ||||
617 | rk32 += 4; | |||
618 | rmod4 = (rmod4 + 1) % 4; | |||
619 | if (--nrounds == 0) | |||
620 | break; | |||
621 | ||||
622 | A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0); | |||
623 | A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1); | |||
624 | A_0 ^= loadroundkey(rk32); | |||
625 | A_1 ^= loadroundkey(rk32); | |||
626 | A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0); | |||
627 | A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1); | |||
628 | A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]); | |||
629 | A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]); | |||
630 | A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]); | |||
631 | A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]); | |||
632 | x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]); | |||
633 | x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]); | |||
634 | } | |||
635 | x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0); | |||
636 | x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1); | |||
637 | x0 ^= loadroundkey(rk32); | |||
638 | x1 ^= loadroundkey(rk32); | |||
639 | return (uint8x16x2_t) { .val = { | |||
640 | [0] = vqtbl1q_u8(x0, sr[rmod4]), | |||
641 | [1] = vqtbl1q_u8(x1, sr[rmod4]), | |||
642 | } }; | |||
643 | } | |||
644 | ||||
592 | uint8x16_t | 645 | uint8x16_t | |
593 | aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) | 646 | aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) | |
594 | { | 647 | { | |
595 | const uint32_t *rk32 = dec->aesd_aes.aes_rk; | 648 | const uint32_t *rk32 = dec->aesd_aes.aes_rk; | |
596 | unsigned i = 3 & ~(nrounds - 1); | 649 | unsigned i = 3 & ~(nrounds - 1); | |
597 | uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; | 650 | uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; | |
598 | uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; | 651 | uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; | |
599 | uint8x16_t io, jo, mc; | 652 | uint8x16_t io, jo, mc; | |
600 | 653 | |||
601 | x = aes_schedule_transform(x, dipt); | 654 | x = aes_schedule_transform(x, dipt); | |
602 | x ^= loadroundkey(rk32); | 655 | x ^= loadroundkey(rk32); | |
603 | rk32 += 4; | 656 | rk32 += 4; | |
604 | 657 | |||
@@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec, | @@ -618,14 +671,70 @@ aes_neon_dec1(const struct aesdec *dec, | |||
618 | x = vqtbl1q_u8(x, mc); | 671 | x = vqtbl1q_u8(x, mc); | |
619 | x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); | 672 | x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); | |
620 | 673 | |||
621 | x = vqtbl1q_u8(x, mc); | 674 | x = vqtbl1q_u8(x, mc); | |
622 | x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); | 675 | x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); | |
623 | 676 | |||
624 | mc = vextq_u8(mc, mc, 12); | 677 | mc = vextq_u8(mc, mc, 12); | |
625 | } | 678 | } | |
626 | x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); | 679 | x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); | |
627 | x ^= loadroundkey(rk32); | 680 | x ^= loadroundkey(rk32); | |
628 | return vqtbl1q_u8(x, sr[i]); | 681 | return vqtbl1q_u8(x, sr[i]); | |
629 | } | 682 | } | |
630 | 683 | |||
684 | uint8x16x2_t | |||
685 | aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds) | |||
686 | { | |||
687 | const uint32_t *rk32 = dec->aesd_aes.aes_rk; | |||
688 | unsigned i = 3 & ~(nrounds - 1); | |||
689 | uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv; | |||
690 | uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva; | |||
691 | uint8x16_t x0 = x.val[0], x1 = x.val[1]; | |||
692 | uint8x16_t io0, jo0, io1, jo1, mc; | |||
693 | ||||
694 | x0 = aes_schedule_transform(x0, dipt); | |||
695 | x1 = aes_schedule_transform(x1, dipt); | |||
696 | x0 ^= loadroundkey(rk32); | |||
697 | x1 ^= loadroundkey(rk32); | |||
698 | rk32 += 4; | |||
699 | ||||
700 | mc = mc_forward[3]; | |||
701 | for (;;) { | |||
702 | subbytes(&io0, &jo0, x0, inv_, inva_); | |||
703 | subbytes(&io1, &jo1, x1, inv_, inva_); | |||
704 | if (--nrounds == 0) | |||
705 | break; | |||
706 | ||||
707 | x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0); | |||
708 | x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1); | |||
709 | x0 ^= loadroundkey(rk32); | |||
710 | x1 ^= loadroundkey(rk32); | |||
711 | rk32 += 4; /* next round key */ | |||
712 | ||||
713 | x0 = vqtbl1q_u8(x0, mc); | |||
714 | x1 = vqtbl1q_u8(x1, mc); | |||
715 | x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0); | |||
716 | x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1); | |||
717 | ||||
718 | x0 = vqtbl1q_u8(x0, mc); | |||
719 | x1 = vqtbl1q_u8(x1, mc); | |||
720 | x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0); | |||
721 | x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1); | |||
722 | ||||
723 | x0 = vqtbl1q_u8(x0, mc); | |||
724 | x1 = vqtbl1q_u8(x1, mc); | |||
725 | x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0); | |||
726 | x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1); | |||
727 | ||||
728 | mc = vextq_u8(mc, mc, 12); | |||
729 | } | |||
730 | x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0); | |||
731 | x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1); | |||
732 | x0 ^= loadroundkey(rk32); | |||
733 | x1 ^= loadroundkey(rk32); | |||
734 | return (uint8x16x2_t) { .val = { | |||
735 | [0] = vqtbl1q_u8(x0, sr[i]), | |||
736 | [1] = vqtbl1q_u8(x1, sr[i]), | |||
737 | } }; | |||
738 | } | |||
739 | ||||
631 | #endif | 740 | #endif |
--- src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/25 22:36:06 1.3
+++ src/sys/crypto/aes/arch/arm/aes_neon_subr.c 2020/07/28 20:11:09 1.4
@@ -1,14 +1,14 @@ | @@ -1,14 +1,14 @@ | |||
1 | /* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */ | 1 | /* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
@@ -17,27 +17,27 @@ | @@ -17,27 +17,27 @@ | |||
17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | 17 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
26 | * POSSIBILITY OF SUCH DAMAGE. | 26 | * POSSIBILITY OF SUCH DAMAGE. | |
27 | */ | 27 | */ | |
28 | 28 | |||
29 | #include <sys/cdefs.h> | 29 | #include <sys/cdefs.h> | |
30 | __KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $"); | 30 | __KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $"); | |
31 | 31 | |||
32 | #include <sys/endian.h> | 32 | #include <sys/endian.h> | |
33 | 33 | |||
34 | #ifdef _KERNEL | 34 | #ifdef _KERNEL | |
35 | #include <sys/systm.h> | 35 | #include <sys/systm.h> | |
36 | #include <lib/libkern/libkern.h> | 36 | #include <lib/libkern/libkern.h> | |
37 | #else | 37 | #else | |
38 | #include <assert.h> | 38 | #include <assert.h> | |
39 | #include <inttypes.h> | 39 | #include <inttypes.h> | |
40 | #include <stdio.h> | 40 | #include <stdio.h> | |
41 | #define KASSERT assert | 41 | #define KASSERT assert | |
42 | #endif | 42 | #endif | |
43 | 43 | |||
@@ -101,34 +101,53 @@ void | @@ -101,34 +101,53 @@ void | |||
101 | aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], | 101 | aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], | |
102 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | 102 | uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], | |
103 | uint32_t nrounds) | 103 | uint32_t nrounds) | |
104 | { | 104 | { | |
105 | uint8x16_t iv0, cv, b; | 105 | uint8x16_t iv0, cv, b; | |
106 | 106 | |||
107 | KASSERT(nbytes); | 107 | KASSERT(nbytes); | |
108 | KASSERT(nbytes % 16 == 0); | 108 | KASSERT(nbytes % 16 == 0); | |
109 | 109 | |||
110 | iv0 = loadblock(iv); | 110 | iv0 = loadblock(iv); | |
111 | cv = loadblock(in + nbytes - 16); | 111 | cv = loadblock(in + nbytes - 16); | |
112 | storeblock(iv, cv); | 112 | storeblock(iv, cv); | |
113 | 113 | |||
114 | for (;;) { | 114 | if (nbytes % 32) { | |
115 | KASSERT(nbytes % 32 == 16); | |||
115 | b = aes_neon_dec1(dec, cv, nrounds); | 116 | b = aes_neon_dec1(dec, cv, nrounds); | |
116 | if ((nbytes -= 16) == 0) | 117 | if ((nbytes -= 16) == 0) | |
117 | break; | 118 | goto out; | |
119 | cv = loadblock(in + nbytes - 16); | |||
120 | storeblock(out + nbytes, cv ^ b); | |||
121 | } | |||
122 | ||||
123 | for (;;) { | |||
124 | uint8x16x2_t b2; | |||
125 | ||||
126 | KASSERT(nbytes >= 32); | |||
127 | ||||
128 | b2.val[1] = cv; | |||
129 | b2.val[0] = cv = loadblock(in + nbytes - 32); | |||
130 | b2 = aes_neon_dec2(dec, b2, nrounds); | |||
131 | storeblock(out + nbytes - 16, cv ^ b2.val[1]); | |||
132 | if ((nbytes -= 32) == 0) { | |||
133 | b = b2.val[0]; | |||
134 | goto out; | |||
135 | } | |||
118 | cv = loadblock(in + nbytes - 16); | 136 | cv = loadblock(in + nbytes - 16); | |
119 | storeblock(out + nbytes, b ^ cv); | 137 | storeblock(out + nbytes, cv ^ b2.val[0]); | |
120 | } | 138 | } | |
121 | storeblock(out, b ^ iv0); | 139 | ||
140 | out: storeblock(out, b ^ iv0); | |||
122 | } | 141 | } | |
123 | 142 | |||
124 | static inline uint8x16_t | 143 | static inline uint8x16_t | |
125 | aes_neon_xts_update(uint8x16_t t8) | 144 | aes_neon_xts_update(uint8x16_t t8) | |
126 | { | 145 | { | |
127 | const int32x4_t zero = vdupq_n_s32(0); | 146 | const int32x4_t zero = vdupq_n_s32(0); | |
128 | const int32x4_t carry = {0x87, 1, 1, 1}; | 147 | const int32x4_t carry = {0x87, 1, 1, 1}; | |
129 | int32x4_t t, t_; | 148 | int32x4_t t, t_; | |
130 | uint32x4_t mask; | 149 | uint32x4_t mask; | |
131 | 150 | |||
132 | t = vreinterpretq_s32_u8(t8); | 151 | t = vreinterpretq_s32_u8(t8); | |
133 | mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ | 152 | mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ | |
134 | mask = vextq_u32(mask, mask, 3); /* rotate quarters */ | 153 | mask = vextq_u32(mask, mask, 3); /* rotate quarters */ | |
@@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void) | @@ -176,51 +195,85 @@ aes_neon_xts_update_selftest(void) | |||
176 | } | 195 | } | |
177 | 196 | |||
178 | void | 197 | void | |
179 | aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], | 198 | aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], | |
180 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | 199 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | |
181 | uint32_t nrounds) | 200 | uint32_t nrounds) | |
182 | { | 201 | { | |
183 | uint8x16_t t, b; | 202 | uint8x16_t t, b; | |
184 | 203 | |||
185 | KASSERT(nbytes); | 204 | KASSERT(nbytes); | |
186 | KASSERT(nbytes % 16 == 0); | 205 | KASSERT(nbytes % 16 == 0); | |
187 | 206 | |||
188 | t = loadblock(tweak); | 207 | t = loadblock(tweak); | |
189 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 208 | if (nbytes % 32) { | |
209 | KASSERT(nbytes % 32 == 16); | |||
190 | b = t ^ loadblock(in); | 210 | b = t ^ loadblock(in); | |
191 | b = aes_neon_enc1(enc, b, nrounds); | 211 | b = aes_neon_enc1(enc, b, nrounds); | |
192 | storeblock(out, t ^ b); | 212 | storeblock(out, t ^ b); | |
193 | t = aes_neon_xts_update(t); | 213 | t = aes_neon_xts_update(t); | |
214 | nbytes -= 16; | |||
215 | in += 16; | |||
216 | out += 16; | |||
217 | } | |||
218 | for (; nbytes; nbytes -= 32, in += 32, out += 32) { | |||
219 | uint8x16_t t1; | |||
220 | uint8x16x2_t b2; | |||
221 | ||||
222 | t1 = aes_neon_xts_update(t); | |||
223 | b2.val[0] = t ^ loadblock(in); | |||
224 | b2.val[1] = t1 ^ loadblock(in + 16); | |||
225 | b2 = aes_neon_enc2(enc, b2, nrounds); | |||
226 | storeblock(out, b2.val[0] ^ t); | |||
227 | storeblock(out + 16, b2.val[1] ^ t1); | |||
228 | ||||
229 | t = aes_neon_xts_update(t1); | |||
194 | } | 230 | } | |
195 | storeblock(tweak, t); | 231 | storeblock(tweak, t); | |
196 | } | 232 | } | |
197 | 233 | |||
198 | void | 234 | void | |
199 | aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], | 235 | aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], | |
200 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | 236 | uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], | |
201 | uint32_t nrounds) | 237 | uint32_t nrounds) | |
202 | { | 238 | { | |
203 | uint8x16_t t, b; | 239 | uint8x16_t t, b; | |
204 | 240 | |||
205 | KASSERT(nbytes); | 241 | KASSERT(nbytes); | |
206 | KASSERT(nbytes % 16 == 0); | 242 | KASSERT(nbytes % 16 == 0); | |
207 | 243 | |||
208 | t = loadblock(tweak); | 244 | t = loadblock(tweak); | |
209 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 245 | if (nbytes % 32) { | |
246 | KASSERT(nbytes % 32 == 16); | |||
210 | b = t ^ loadblock(in); | 247 | b = t ^ loadblock(in); | |
211 | b = aes_neon_dec1(dec, b, nrounds); | 248 | b = aes_neon_dec1(dec, b, nrounds); | |
212 | storeblock(out, t ^ b); | 249 | storeblock(out, t ^ b); | |
213 | t = aes_neon_xts_update(t); | 250 | t = aes_neon_xts_update(t); | |
251 | nbytes -= 16; | |||
252 | in += 16; | |||
253 | out += 16; | |||
254 | } | |||
255 | for (; nbytes; nbytes -= 32, in += 32, out += 32) { | |||
256 | uint8x16_t t1; | |||
257 | uint8x16x2_t b2; | |||
258 | ||||
259 | t1 = aes_neon_xts_update(t); | |||
260 | b2.val[0] = t ^ loadblock(in); | |||
261 | b2.val[1] = t1 ^ loadblock(in + 16); | |||
262 | b2 = aes_neon_dec2(dec, b2, nrounds); | |||
263 | storeblock(out, b2.val[0] ^ t); | |||
264 | storeblock(out + 16, b2.val[1] ^ t1); | |||
265 | ||||
266 | t = aes_neon_xts_update(t1); | |||
214 | } | 267 | } | |
215 | storeblock(tweak, t); | 268 | storeblock(tweak, t); | |
216 | } | 269 | } | |
217 | 270 | |||
218 | void | 271 | void | |
219 | aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], | 272 | aes_neon_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16], | |
220 | size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) | 273 | size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds) | |
221 | { | 274 | { | |
222 | uint8x16_t auth; | 275 | uint8x16_t auth; | |
223 | 276 | |||
224 | KASSERT(nbytes); | 277 | KASSERT(nbytes); | |
225 | KASSERT(nbytes % 16 == 0); | 278 | KASSERT(nbytes % 16 == 0); | |
226 | 279 | |||
@@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e | @@ -252,58 +305,78 @@ aes_neon_ccm_enc1(const struct aesenc *e | |||
252 | uint32_t nrounds) | 305 | uint32_t nrounds) | |
253 | { | 306 | { | |
254 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | 307 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | |
255 | uint8x16_t auth, ptxt, ctr_be; | 308 | uint8x16_t auth, ptxt, ctr_be; | |
256 | uint32x4_t ctr; | 309 | uint32x4_t ctr; | |
257 | 310 | |||
258 | KASSERT(nbytes); | 311 | KASSERT(nbytes); | |
259 | KASSERT(nbytes % 16 == 0); | 312 | KASSERT(nbytes % 16 == 0); | |
260 | 313 | |||
261 | auth = loadblock(authctr); | 314 | auth = loadblock(authctr); | |
262 | ctr_be = loadblock(authctr + 16); | 315 | ctr_be = loadblock(authctr + 16); | |
263 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | 316 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | |
264 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 317 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | |
318 | uint8x16x2_t b2; | |||
265 | ptxt = loadblock(in); | 319 | ptxt = loadblock(in); | |
266 | auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); | |||
267 | ctr = vaddq_u32(ctr, ctr32_inc); | 320 | ctr = vaddq_u32(ctr, ctr32_inc); | |
268 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | 321 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |
269 | storeblock(out, ptxt ^ aes_neon_enc1(enc, ctr_be, nrounds)); | 322 | ||
323 | b2.val[0] = auth ^ ptxt; | |||
324 | b2.val[1] = ctr_be; | |||
325 | b2 = aes_neon_enc2(enc, b2, nrounds); | |||
326 | auth = b2.val[0]; | |||
327 | storeblock(out, ptxt ^ b2.val[1]); | |||
270 | } | 328 | } | |
271 | storeblock(authctr, auth); | 329 | storeblock(authctr, auth); | |
272 | storeblock(authctr + 16, ctr_be); | 330 | storeblock(authctr + 16, ctr_be); | |
273 | } | 331 | } | |
274 | 332 | |||
275 | void | 333 | void | |
276 | aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], | 334 | aes_neon_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16], | |
277 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | 335 | uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], | |
278 | uint32_t nrounds) | 336 | uint32_t nrounds) | |
279 | { | 337 | { | |
280 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | 338 | const uint32x4_t ctr32_inc = {0, 0, 0, 1}; | |
281 | uint8x16_t auth, ctr_be, ptxt; | 339 | uint8x16_t auth, ctr_be, ptxt, pad; | |
282 | uint32x4_t ctr; | 340 | uint32x4_t ctr; | |
283 | 341 | |||
284 | KASSERT(nbytes); | 342 | KASSERT(nbytes); | |
285 | KASSERT(nbytes % 16 == 0); | 343 | KASSERT(nbytes % 16 == 0); | |
286 | 344 | |||
287 | auth = loadblock(authctr); | |||
288 | ctr_be = loadblock(authctr + 16); | 345 | ctr_be = loadblock(authctr + 16); | |
289 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | 346 | ctr = vreinterpretq_u32_u8(vbetoh32q_u8(ctr_be)); | |
290 | for (; nbytes; nbytes -= 16, in += 16, out += 16) { | 347 | ctr = vaddq_u32(ctr, ctr32_inc); | |
348 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |||
349 | pad = aes_neon_enc1(enc, ctr_be, nrounds); | |||
350 | auth = loadblock(authctr); | |||
351 | for (;; in += 16, out += 16) { | |||
352 | uint8x16x2_t b2; | |||
353 | ||||
354 | ptxt = loadblock(in) ^ pad; | |||
355 | auth ^= ptxt; | |||
356 | storeblock(out, ptxt); | |||
357 | ||||
358 | if ((nbytes -= 16) == 0) | |||
359 | break; | |||
360 | ||||
291 | ctr = vaddq_u32(ctr, ctr32_inc); | 361 | ctr = vaddq_u32(ctr, ctr32_inc); | |
292 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | 362 | ctr_be = vhtobe32q_u8(vreinterpretq_u8_u32(ctr)); | |
293 | ptxt = loadblock(in) ^ aes_neon_enc1(enc, ctr_be, nrounds); | 363 | b2.val[0] = auth; | |
294 | storeblock(out, ptxt); | 364 | b2.val[1] = ctr_be; | |
295 | auth = aes_neon_enc1(enc, auth ^ ptxt, nrounds); | 365 | b2 = aes_neon_enc2(enc, b2, nrounds); | |
366 | auth = b2.val[0]; | |||
367 | pad = b2.val[1]; | |||
296 | } | 368 | } | |
369 | auth = aes_neon_enc1(enc, auth, nrounds); | |||
297 | storeblock(authctr, auth); | 370 | storeblock(authctr, auth); | |
298 | storeblock(authctr + 16, ctr_be); | 371 | storeblock(authctr + 16, ctr_be); | |
299 | } | 372 | } | |
300 | 373 | |||
301 | int | 374 | int | |
302 | aes_neon_selftest(void) | 375 | aes_neon_selftest(void) | |
303 | { | 376 | { | |
304 | 377 | |||
305 | if (aes_neon_xts_update_selftest()) | 378 | if (aes_neon_xts_update_selftest()) | |
306 | return -1; | 379 | return -1; | |
307 | 380 | |||
308 | return 0; | 381 | return 0; | |
309 | } | 382 | } |
--- src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/06/29 23:56:31 1.1
+++ src/sys/crypto/aes/arch/arm/aes_neon_impl.h 2020/07/28 20:11:09 1.2
@@ -1,14 +1,14 @@ | @@ -1,14 +1,14 @@ | |||
1 | /* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */ | 1 | /* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
@@ -29,14 +29,43 @@ | @@ -29,14 +29,43 @@ | |||
29 | #ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | 29 | #ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | |
30 | #define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | 30 | #define _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H | |
31 | 31 | |||
32 | #include <sys/types.h> | 32 | #include <sys/types.h> | |
33 | 33 | |||
34 | #include "arm_neon.h" | 34 | #include "arm_neon.h" | |
35 | 35 | |||
36 | #include <crypto/aes/aes.h> | 36 | #include <crypto/aes/aes.h> | |
37 | #include <crypto/aes/arch/arm/aes_neon.h> | 37 | #include <crypto/aes/arch/arm/aes_neon.h> | |
38 | 38 | |||
39 | uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); | 39 | uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); | |
40 | uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); | 40 | uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); | |
41 | 41 | |||
42 | #ifdef __aarch64__ | |||
43 | ||||
44 | uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned); | |||
45 | uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned); | |||
46 | ||||
47 | #else | |||
48 | ||||
49 | static inline uint8x16x2_t | |||
50 | aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds) | |||
51 | { | |||
52 | ||||
53 | return (uint8x16x2_t) { .val = { | |||
54 | [0] = aes_neon_enc1(enc, b2.val[0], nrounds), | |||
55 | [1] = aes_neon_enc1(enc, b2.val[1], nrounds), | |||
56 | } }; | |||
57 | } | |||
58 | ||||
59 | static inline uint8x16x2_t | |||
60 | aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds) | |||
61 | { | |||
62 | ||||
63 | return (uint8x16x2_t) { .val = { | |||
64 | [0] = aes_neon_dec1(dec, b2.val[0], nrounds), | |||
65 | [1] = aes_neon_dec1(dec, b2.val[1], nrounds), | |||
66 | } }; | |||
67 | } | |||
68 | ||||
69 | #endif | |||
70 | ||||
42 | #endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ | 71 | #endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */ |
--- src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/25 22:43:01 1.6
+++ src/sys/crypto/aes/arch/arm/arm_neon.h 2020/07/28 20:11:09 1.7
@@ -1,14 +1,14 @@ | @@ -1,14 +1,14 @@ | |||
1 | /* $NetBSD: arm_neon.h,v 1.6 2020/07/25 22:43:01 riastradh Exp $ */ | 1 | /* $NetBSD: arm_neon.h,v 1.7 2020/07/28 20:11:09 riastradh Exp $ */ | |
2 | 2 | |||
3 | /*- | 3 | /*- | |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | |
5 | * All rights reserved. | 5 | * All rights reserved. | |
6 | * | 6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | 7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | 8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | 9 | * are met: | |
10 | * 1. Redistributions of source code must retain the above copyright | 10 | * 1. Redistributions of source code must retain the above copyright | |
11 | * notice, this list of conditions and the following disclaimer. | 11 | * notice, this list of conditions and the following disclaimer. | |
12 | * 2. Redistributions in binary form must reproduce the above copyright | 12 | * 2. Redistributions in binary form must reproduce the above copyright | |
13 | * notice, this list of conditions and the following disclaimer in the | 13 | * notice, this list of conditions and the following disclaimer in the | |
14 | * documentation and/or other materials provided with the distribution. | 14 | * documentation and/or other materials provided with the distribution. | |
@@ -32,38 +32,40 @@ | @@ -32,38 +32,40 @@ | |||
32 | #if defined(__GNUC__) && !defined(__clang__) | 32 | #if defined(__GNUC__) && !defined(__clang__) | |
33 | 33 | |||
34 | #define _INTRINSATTR \ | 34 | #define _INTRINSATTR \ | |
35 | __extension__ \ | 35 | __extension__ \ | |
36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | 36 | __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | |
37 | 37 | |||
38 | #ifdef __aarch64__ | 38 | #ifdef __aarch64__ | |
39 | typedef __Int32x4_t int32x4_t; | 39 | typedef __Int32x4_t int32x4_t; | |
40 | typedef __Int64x2_t int64x2_t; | 40 | typedef __Int64x2_t int64x2_t; | |
41 | typedef __Int8x16_t int8x16_t; | 41 | typedef __Int8x16_t int8x16_t; | |
42 | typedef __Uint32x4_t uint32x4_t; | 42 | typedef __Uint32x4_t uint32x4_t; | |
43 | typedef __Uint64x2_t uint64x2_t; | 43 | typedef __Uint64x2_t uint64x2_t; | |
44 | typedef __Uint8x16_t uint8x16_t; | 44 | typedef __Uint8x16_t uint8x16_t; | |
45 | typedef struct { uint8x16_t val[2]; } uint8x16x2_t; | |||
45 | #else | 46 | #else | |
46 | typedef __simd128_int32_t int32x4_t; | 47 | typedef __simd128_int32_t int32x4_t; | |
47 | typedef __simd128_int64_t int64x2_t; | 48 | typedef __simd128_int64_t int64x2_t; | |
48 | typedef __simd128_int8_t int8x16_t; | 49 | typedef __simd128_int8_t int8x16_t; | |
49 | typedef __simd128_uint32_t uint32x4_t; | 50 | typedef __simd128_uint32_t uint32x4_t; | |
50 | typedef __simd128_uint64_t uint64x2_t; | 51 | typedef __simd128_uint64_t uint64x2_t; | |
51 | typedef __simd128_uint8_t uint8x16_t; | 52 | typedef __simd128_uint8_t uint8x16_t; | |
52 | 53 | |||
53 | typedef __simd64_int8_t int8x8_t; | 54 | typedef __simd64_int8_t int8x8_t; | |
54 | typedef __simd64_uint8_t uint8x8_t; | 55 | typedef __simd64_uint8_t uint8x8_t; | |
55 | typedef __builtin_neon_udi uint64x1_t; | 56 | typedef __builtin_neon_udi uint64x1_t; | |
56 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | 57 | typedef struct { uint8x8_t val[2]; } uint8x8x2_t; | |
58 | typedef struct { uint8x16_t val[2]; } uint8x16x2_t; | |||
57 | #endif | 59 | #endif | |
58 | 60 | |||
59 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | 61 | #if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) | |
60 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | 62 | #define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) | |
61 | #else | 63 | #else | |
62 | #define __neon_lane_index(__v, __i) __i | 64 | #define __neon_lane_index(__v, __i) __i | |
63 | #endif | 65 | #endif | |
64 | 66 | |||
65 | #elif defined(__clang__) | 67 | #elif defined(__clang__) | |
66 | 68 | |||
67 | #define _INTRINSATTR \ | 69 | #define _INTRINSATTR \ | |
68 | __attribute__((__always_inline__, __nodebug__)) | 70 | __attribute__((__always_inline__, __nodebug__)) | |
69 | 71 |