Mon Jun 29 23:29:40 2020 UTC ()
Add x86 AES-NI support.

Limited to amd64 for now.  In principle, AES-NI should work in 32-bit
mode, and there may even be some 32-bit-only CPUs that support
AES-NI, but that requires work to adapt the assembly.


(riastradh)
diff -r1.111 -r1.112 src/sys/arch/x86/conf/files.x86
diff -r1.107 -r1.108 src/sys/arch/x86/x86/identcpu.c
diff -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_ni.c
diff -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_ni.h
diff -r0 -r1.1 src/sys/crypto/aes/arch/x86/aes_ni_64.S
diff -r0 -r1.1 src/sys/crypto/aes/arch/x86/files.aesni

cvs diff -r1.111 -r1.112 src/sys/arch/x86/conf/files.x86

--- src/sys/arch/x86/conf/files.x86 2020/05/06 19:45:12 1.111
+++ src/sys/arch/x86/conf/files.x86 2020/06/29 23:29:39 1.112
@@ -1,14 +1,14 @@
-# $NetBSD: files.x86,v 1.111 2020/05/06 19:45:12 bouyer Exp $
+# $NetBSD: files.x86,v 1.112 2020/06/29 23:29:39 riastradh Exp $
 
 # options for MP configuration through the MP spec
 defflag opt_mpbios.h MPBIOS MPDEBUG MPBIOS_SCANPCI
 defparam opt_mpbios.h MPVERBOSE
 
 # MTRR support
 defflag MTRR
 
 # Interrupt debug
 defflag opt_intrdebug.h INTRDEBUG
 
 # PCI fixup options
 defflag opt_pcifixup.h PCI_ADDR_FIXUP PCI_BUS_FIXUP
@@ -155,13 +155,16 @@ file arch/x86/x86/x86_ipmi.c ipmi needs
 
 file arch/x86/x86/vga_post.c vga_post
 
 file arch/x86/pci/pci_machdep.c pci
 #file arch/x86/pci/pci_ranges.c pci
 file arch/x86/pci/pci_intr_machdep.c pci
 file arch/x86/pci/pci_msi_machdep.c pci & ! no_pci_msi_msix
 file arch/x86/pci/msipic.c pci & ! no_pci_msi_msix
 
 file arch/x86/pci/pciide_machdep.c pciide_common
 
 file arch/x86/pci/pci_bus_fixup.c pci_bus_fixup
 file arch/x86/pci/pci_addr_fixup.c pci_addr_fixup
+
+# AES-NI
+include "crypto/aes/arch/x86/files.aesni"

cvs diff -r1.107 -r1.108 src/sys/arch/x86/x86/identcpu.c

--- src/sys/arch/x86/x86/identcpu.c 2020/04/25 15:26:18 1.107
+++ src/sys/arch/x86/x86/identcpu.c 2020/06/29 23:29:39 1.108
@@ -1,14 +1,14 @@
-/* $NetBSD: identcpu.c,v 1.107 2020/04/25 15:26:18 bouyer Exp $ */
+/* $NetBSD: identcpu.c,v 1.108 2020/06/29 23:29:39 riastradh Exp $ */
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Frank van der Linden, and by Jason R. Thorpe.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
@@ -20,35 +20,37 @@
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.107 2020/04/25 15:26:18 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: identcpu.c,v 1.108 2020/06/29 23:29:39 riastradh Exp $");
 
 #include "opt_xen.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/device.h>
 #include <sys/cpu.h>
 
+#include <crypto/aes/arch/x86/aes_ni.h>
+
 #include <uvm/uvm_extern.h>
 
 #include <machine/specialreg.h>
 #include <machine/pio.h>
 #include <machine/cpu.h>
 
 #include <x86/cputypes.h>
 #include <x86/cacheinfo.h>
 #include <x86/cpuvar.h>
 #include <x86/fpu.h>
 
 #include <x86/x86/vmtreg.h>	/* for vmt_hvcall() */
 #include <x86/x86/vmtvar.h>	/* for vmt_hvcall() */
@@ -985,26 +987,30 @@ cpu_probe(struct cpu_info *ci)
 	}
 
 	ci->ci_feat_val[0] &= ~CPUID_FEAT_BLACKLIST;
 	if (ci == &cpu_info_primary) {
 		/* If first. Boot Processor is the cpu_feature reference. */
 		for (i = 0; i < __arraycount(cpu_feature); i++) {
 			cpu_feature[i] = ci->ci_feat_val[i];
 		}
 		identify_hypervisor();
 #ifndef XENPV
 		/* Early patch of text segment. */
 		x86_patch(true);
 #endif
+#ifdef __x86_64__	/* not yet implemented on i386 */
+		if (cpu_feature[1] & CPUID2_AES)
+			aes_md_init(&aes_ni_impl);
+#endif
 	} else {
 		/*
 		 * If not first. Warn about cpu_feature mismatch for
 		 * secondary CPUs.
 		 */
 		for (i = 0; i < __arraycount(cpu_feature); i++) {
 			if (cpu_feature[i] != ci->ci_feat_val[i])
 				aprint_error_dev(ci->ci_dev,
 				    "feature mismatch: cpu_feature[%d] is "
 				    "%#x, but CPU reported %#x\n",
 				    i, cpu_feature[i], ci->ci_feat_val[i]);
 		}
 	}
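
For context, CPUID2_AES is bit 25 of %ecx from CPUID leaf 1, which is what the cpu_feature[1] test above keys on.  A minimal userland sketch of the same detection, using the compiler-provided <cpuid.h> instead of the kernel's cached cpu_feature[] (illustrative only, not part of this commit):

#include <cpuid.h>
#include <stdio.h>

/* CPUID.01H:ECX bit 25 indicates AES-NI support. */
#define	CPUID2_AES	(1u << 25)

int
main(void)
{
	unsigned eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;	/* CPUID leaf 1 unavailable */
	printf("AES-NI %ssupported\n", (ecx & CPUID2_AES) ? "" : "not ");
	return 0;
}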

File Added: src/sys/crypto/aes/arch/x86/aes_ni.c
/*	$NetBSD: aes_ni.c,v 1.1 2020/06/29 23:29:40 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_ni.c,v 1.1 2020/06/29 23:29:40 riastradh Exp $");

#include <sys/types.h>
#include <sys/systm.h>

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_ni.h>

#include <x86/cpuvar.h>
#include <x86/fpu.h>
#include <x86/specialreg.h>

static void
aesni_setenckey(struct aesenc *enc, const uint8_t key[static 16],
    uint32_t nrounds)
{

	switch (nrounds) {
	case 10:
		aesni_setenckey128(enc, key);
		break;
	case 12:
		aesni_setenckey192(enc, key);
		break;
	case 14:
		aesni_setenckey256(enc, key);
		break;
	default:
		panic("invalid AES rounds: %u", nrounds);
	}
}

static void
aesni_setenckey_impl(struct aesenc *enc, const uint8_t key[static 16],
    uint32_t nrounds)
{

	fpu_kern_enter();
	aesni_setenckey(enc, key, nrounds);
	fpu_kern_leave();
}

static void
aesni_setdeckey_impl(struct aesdec *dec, const uint8_t key[static 16],
    uint32_t nrounds)
{
	struct aesenc enc;

	fpu_kern_enter();
	aesni_setenckey(&enc, key, nrounds);
	aesni_enctodec(&enc, dec, nrounds);
	fpu_kern_leave();

	explicit_memset(&enc, 0, sizeof enc);
}

static void
aesni_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{

	fpu_kern_enter();
	aesni_enc(enc, in, out, nrounds);
	fpu_kern_leave();
}

static void
aesni_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{

	fpu_kern_enter();
	aesni_dec(dec, in, out, nrounds);
	fpu_kern_leave();
}

static void
aesni_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{

	KASSERT(nbytes % 16 == 0);

	fpu_kern_enter();
	aesni_cbc_enc(enc, in, out, nbytes, iv, nrounds);
	fpu_kern_leave();
}

static void
aesni_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{

	KASSERT(nbytes % 16 == 0);

	fpu_kern_enter();

	if (nbytes % 128) {
		aesni_cbc_dec1(dec, in, out, nbytes % 128, iv, nrounds);
		in += nbytes % 128;
		out += nbytes % 128;
		nbytes -= nbytes % 128;
	}

	KASSERT(nbytes % 128 == 0);
	if (nbytes)
		aesni_cbc_dec8(dec, in, out, nbytes, iv, nrounds);

	fpu_kern_leave();
}

static void
aesni_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{

	KASSERT(nbytes % 16 == 0);

	fpu_kern_enter();

	if (nbytes % 128) {
		aesni_xts_enc1(enc, in, out, nbytes % 128, iv, nrounds);
		in += nbytes % 128;
		out += nbytes % 128;
		nbytes -= nbytes % 128;
	}

	KASSERT(nbytes % 128 == 0);
	if (nbytes)
		aesni_xts_enc8(enc, in, out, nbytes, iv, nrounds);

	fpu_kern_leave();
}

static void
aesni_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{

	KASSERT(nbytes % 16 == 0);

	fpu_kern_enter();

	if (nbytes % 128) {
		aesni_xts_dec1(dec, in, out, nbytes % 128, iv, nrounds);
		in += nbytes % 128;
		out += nbytes % 128;
		nbytes -= nbytes % 128;
	}

	KASSERT(nbytes % 128 == 0);
	if (nbytes)
		aesni_xts_dec8(dec, in, out, nbytes, iv, nrounds);

	fpu_kern_leave();
}

static int
aesni_xts_update_selftest(void)
{
	static const struct {
		uint8_t	in[16], out[16];
	} cases[] = {
		{{1}, {2}},
		{{0,0,0,0x80}, {0,0,0,0,1}},
		{{0,0,0,0,0,0,0,0x80}, {0,0,0,0,0,0,0,0,1}},
		{{0,0,0,0x80,0,0,0,0x80}, {0,0,0,0,1,0,0,0,1}},
		{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87}},
		{{0,0,0,0,0,0,0,0x80,0,0,0,0,0,0,0,0x80},
		 {0x87,0,0,0,0,0,0,0,1}},
		{{0,0,0,0x80,0,0,0,0,0,0,0,0,0,0,0,0x80}, {0x87,0,0,0,1}},
		{{0,0,0,0x80,0,0,0,0x80,0,0,0,0,0,0,0,0x80},
		 {0x87,0,0,0,1,0,0,0,1}},
	};
	unsigned i;
	uint8_t tweak[16];

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		aesni_xts_update(cases[i].in, tweak);
		if (memcmp(tweak, cases[i].out, 16))
			return -1;
	}

	/* Success!  */
	return 0;
}

static int
aesni_probe(void)
{
	int result = 0;

	/* Verify that the CPU supports AES-NI.  */
	if ((cpu_feature[1] & CPUID2_AES) == 0)
		return -1;

	fpu_kern_enter();

	/* Verify that our XTS tweak update logic works.  */
	if (aesni_xts_update_selftest())
		result = -1;

	fpu_kern_leave();

	return result;
}

struct aes_impl aes_ni_impl = {
	.ai_name = "Intel AES-NI",
	.ai_probe = aesni_probe,
	.ai_setenckey = aesni_setenckey_impl,
	.ai_setdeckey = aesni_setdeckey_impl,
	.ai_enc = aesni_enc_impl,
	.ai_dec = aesni_dec_impl,
	.ai_cbc_enc = aesni_cbc_enc_impl,
	.ai_cbc_dec = aesni_cbc_dec_impl,
	.ai_xts_enc = aesni_xts_enc_impl,
	.ai_xts_dec = aesni_xts_dec_impl,
};
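
The aesni_xts_update_selftest() vectors above pin down the tweak representation: the 16 tweak bytes are the little-endian encoding of an element of GF(2^128), and updating the tweak multiplies it by x modulo x^128 + x^7 + x^2 + x + 1, which is why a carry out of the top bit folds 0x87 back into byte 0.  A portable sketch of the same update, for reference only (the commit itself uses the aesni_xts_mulx assembly further down):

#include <stdint.h>

/*
 * Multiply a 128-bit XTS tweak by x, modulo x^128 + x^7 + x^2 + x + 1.
 * Bit 0 of byte 0 is the x^0 coefficient, matching the selftest vectors.
 */
static void
xts_update_ref(const uint8_t in[16], uint8_t out[16])
{
	unsigned carry = 0, i;

	for (i = 0; i < 16; i++) {
		unsigned msb = in[i] >> 7;	/* bit shifted into next byte */

		out[i] = (uint8_t)((in[i] << 1) | carry);
		carry = msb;
	}
	if (carry)		/* x^128 = x^7 + x^2 + x + 1 */
		out[0] ^= 0x87;
}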

File Added: src/sys/crypto/aes/arch/x86/aes_ni.h
/*	$NetBSD: aes_ni.h,v 1.1 2020/06/29 23:29:40 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef	_CRYPTO_AES_ARCH_X86_AES_NI_H
#define	_CRYPTO_AES_ARCH_X86_AES_NI_H

#include <sys/types.h>

#include <crypto/aes/aes.h>

/* Assembly routines */

void	aesni_setenckey128(struct aesenc *, const uint8_t[static 16]);
void	aesni_setenckey192(struct aesenc *, const uint8_t[static 24]);
void	aesni_setenckey256(struct aesenc *, const uint8_t[static 32]);

void	aesni_enctodec(const struct aesenc *, struct aesdec *, uint32_t);

void	aesni_enc(const struct aesenc *, const uint8_t[static 16],
	    uint8_t[static 16], uint32_t);
void	aesni_dec(const struct aesdec *, const uint8_t[static 16],
	    uint8_t[static 16], uint32_t);

void	aesni_cbc_enc(const struct aesenc *, const uint8_t[static 16],
	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
void	aesni_cbc_dec1(const struct aesdec *, const uint8_t[static 16],
	    uint8_t[static 16], size_t, const uint8_t[static 16], uint32_t);
void	aesni_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
	    uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);

void	aesni_xts_enc1(const struct aesenc *, const uint8_t[static 16],
	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
void	aesni_xts_enc8(const struct aesenc *, const uint8_t[static 128],
	    uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
void	aesni_xts_dec1(const struct aesdec *, const uint8_t[static 16],
	    uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
void	aesni_xts_dec8(const struct aesdec *, const uint8_t[static 128],
	    uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
void	aesni_xts_update(const uint8_t[static 16], uint8_t[static 16]);

extern struct aes_impl aes_ni_impl;

#endif	/* _CRYPTO_AES_ARCH_X86_AES_NI_H */

File Added: src/sys/crypto/aes/arch/x86/aes_ni_64.S
/*	$NetBSD: aes_ni_64.S,v 1.1 2020/06/29 23:29:40 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
 * Packed Single, defined to operate on binary32 floats.  They have
 * exactly the same architectural effects (move a 128-bit quantity from
 * memory into an xmm register).
 *
 * In principle, they might have different microarchitectural effects
 * so that MOVAPS/MOVUPS might incur a penalty when the register is
 * later used for integer paths, but in practice they don't.  So we use
 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
 */
#define	movdqa	movaps
#define	movdqu	movups

/*
 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey128)
	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x40,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x1b,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x36,%xmm0,%xmm2
	call	aesni_expand128
	ret
END(aesni_setenckey128)

/*
 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey192)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movq	0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	lea	0x10(%rdi),%rdi /* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand192b
	ret
END(aesni_setenckey192)

/*
 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey256)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movdqu	0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	movdqa	%xmm1,0x10(%rdi) /* store master key [128:256) as round key */
	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x2,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x8,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x20,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand256a
	ret
END(aesni_setenckey256)

/*
 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	1. Compute the AES-128 round key using the previous round key.
 *	2. Store it at *rkp.
 *	3. Set %xmm0 to it.
 *	4. Advance %rdi to point at the next round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
 *	and all other registers).
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand128,@function
aesni_expand128:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, prk[0], prk[1], prk[2])
	 * %xmm5 := (0, 0, prk[0], prk[1])
	 * %xmm6 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm0 := (rk[0] = t ^ prk[0],
	 *     rk[1] = t ^ prk[0] ^ prk[1],
	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
	 */
	pxor	%xmm2,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm0
	pxor	%xmm6,%xmm0

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand128)

/*
 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to two round keys to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
 *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[2], rkp advanced by two round keys
 *		%xmm0 = nrk, second round key we just computed
 *		%xmm1 = rk, first round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192a,@function
aesni_expand192a:
	/*
	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
	 */
	pshufd	$0b01010101,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
	 * %xmm5 := (0, prk[0], prk[1], prk[2])
	 * %xmm6 := (0, 0, prk[0], prk[1])
	 * %xmm7 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	movdqa	%xmm0,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
	pxor	%xmm2,%xmm4
	pxor	%xmm5,%xmm4
	pxor	%xmm6,%xmm4
	pxor	%xmm7,%xmm4

	/*
	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
	 * and we have yet to compute nrk[2] or nrk[3], which requires
	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
	 * nrk into %xmm0.
	 */

	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	pshufd	$0b11111110,%xmm4,%xmm0

	/*
	 * %xmm6 := (0, 0, rklo[0], rklo[1])
	 * %xmm7 := (0, 0, 0, rklo[0])
	 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7

	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/*
	 * %xmm0 := (nrk[0],
	 *     nrk[1],
	 *     nrk[2] = nrk[1] ^ rklo[0],
	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
	 */
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm0

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	shufps	$0b01000100,%xmm4,%xmm1

	movdqa	%xmm1,(%rdi)		/* store round key */
	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
	lea	0x20(%rdi),%rdi		/* advance two round keys */
	ret
END(aesni_expand192a)

/*
 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-192 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *		%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192b,@function
aesni_expand192b:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1]
	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *     ^ prk[1] ^ prk[2] ^ prk[3]
	 */

	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
	shufps	$0b01001110,%xmm0,%xmm1

	/*
	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
	 * %xmm6 := (0, 0, pprk[2], pprk[3])
	 * %xmm7 := (0, 0, 0, pprk[2])
	 */
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	pxor	%xmm2,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1
	pxor	%xmm7,%xmm1

	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
	pshufd	$0b00001110,%xmm0,%xmm4

	/* %xmm5 := (0, prk[2], xxx, xxx) */
	movdqa	%xmm4,%xmm5
	pslldq	$4,%xmm5

	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
	movdqa	%xmm1,%xmm0

	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
	shufps	$0b00001111,%xmm1,%xmm1

	/*
	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
	 *     xxx,
	 *     xxx)
	 */
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand192b)

/*
 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
 *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = rk, the round key we just computed
 *		%xmm1 = prk, previous round key, preserved from entry
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 *
 *	The computation turns out to be the same as for AES-128; the
 *	previous round key does not figure into it, only the
 *	previous-previous round key.
 */
	aesni_expand256a = aesni_expand128

/*
 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-256 round key.
 *
 *	Internal ABI.  On entry:
 *
 *		%rdi = rkp, pointer to round key to compute
 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
 *
 *	On exit:
 *
 *		%rdi = &rkp[1], rkp advanced by one round key
 *		%xmm0 = prk, previous round key, preserved from entry
 *		%xmm1 = rk, the round key we just computed
 *		%xmm2 = garbage
 *		%xmm4 = garbage
 *		%xmm5 = garbage
 *		%xmm6 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand256b,@function
aesni_expand256b:
	/*
	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
	 */
	pshufd	$0b10101010,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
	 * %xmm5 := (0, 0, pprk[0], pprk[1])
	 * %xmm6 := (0, 0, 0, pprk[0])
	 */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm1 := (rk[0] = t ^ pprk[0],
	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
	 */
	pxor	%xmm2,%xmm1
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand256b)

/*
 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
 *     uint32_t nrounds@rdx)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	nrounds must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enctodec)
	shl	$4,%edx		/* rdx := byte offset of last round key */
	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
1:	sub	$0x10,%rdx	/* advance to next round key */
	lea	0x10(%rsi),%rsi
	jz	2f		/* stop if this is the last one */
	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
	movdqa	%xmm0,(%rsi)	/* store round key */
	jmp	1b
2:	movdqa	(%rdi),%xmm0	/* load first round key */
	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
	ret
END(aesni_enctodec)

/*
 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enc)
	movdqu	(%rsi),%xmm0
	call	aesni_enc1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_enc)

/*
 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_dec)
	movdqu	(%rsi),%xmm0
	call	aesni_dec1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_dec)

/*
 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_enc)
	cmp	$0,%rcx
	jz	2f
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
1:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
	lea	0x10(%rsi),%rsi
	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := ciphertext block */
	movdqu	%xmm0,(%rdx)
	lea	0x10(%rdx),%rdx
	sub	$0x10,%r10
	jnz	1b			/* repeat if r10 is nonzero */
	movdqu	%xmm0,(%r8)		/* store chaining value */
2:	ret
END(aesni_cbc_enc)

/*
 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec1)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
	movdqu	%xmm0,(%r8)		/* update iv */
1:	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := cv ^ ptxt */
	sub	$0x10,%r10
	jz	2f			/* first block if r10 is now zero */
	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	movdqa	%xmm8,%xmm0		/* move cv = ciphertext block */
	jmp	1b
2:	pxor	(%rsp),%xmm0		/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec1)

/*
 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8		/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)		/* save iv */
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
	movdqu	%xmm7,(%r8)		/* update iv */
1:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
	movdqa	%xmm6,%xmm15		/* xmm[8+i] := cv[i], 0<i<8 */
	movdqa	%xmm5,%xmm14
	movdqa	%xmm4,%xmm13
	movdqa	%xmm3,%xmm12
	movdqa	%xmm2,%xmm11
	movdqa	%xmm1,%xmm10
	movdqa	%xmm0,%xmm9
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
	pxor	%xmm15,%xmm7		/* xmm[i] := ptxt[i], 0<i<8 */
	pxor	%xmm14,%xmm6
	pxor	%xmm13,%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm11,%xmm3
	pxor	%xmm10,%xmm2
	pxor	%xmm9,%xmm1
	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
	movdqu	%xmm6,-0x20(%rdx,%r10)
	movdqu	%xmm5,-0x30(%rdx,%r10)
	movdqu	%xmm4,-0x40(%rdx,%r10)
	movdqu	%xmm3,-0x50(%rdx,%r10)
	movdqu	%xmm2,-0x60(%rdx,%r10)
	movdqu	%xmm1,-0x70(%rdx,%r10)
	sub	$0x80,%r10
	jz	2f			/* first block if r10 is now zero */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	jmp	1b
2:	pxor	(%rsp),%xmm0		/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec8)

/*
 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc1		/* xmm0 := AES(ptxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store ciphertext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_enc1)

/*
 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ptxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ptxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_enc8		/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store ciphertext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_enc8)

/*
 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec1)
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec1		/* xmm0 := AES^-1(ctxt ^ tweak) */
	pxor	%xmm15,%xmm0		/* xmm0 := AES^-1(ctxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)		/* store plaintext block */
	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b			/* repeat if more blocks */
	movdqu	%xmm15,(%r8)		/* update tweak */
	ret
END(aesni_xts_dec1)

/*
 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec8)
	push	%rbp			/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10		/* r10 := nbytes */
	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx		/* ecx := nrounds */
	call	aesni_dec8		/* xmm[i] := AES^-1(ctxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0		/* xmm[i] := AES^-1(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b			/* repeat if more block groups */
	movdqu	%xmm15,(%r8)		/* update tweak */
	leave
	ret
END(aesni_xts_dec8)

/*
 * aesni_xts_mulx(tweak@xmm15)
 *
 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses %xmm0 as temporary.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_xts_mulx,@function
aesni_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low quadword must be
	 *     shifted into the low bit of the high quadword, and
	 * (b) whether the high bit of the high quadword must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
	psllq	$1,%xmm15	/* shift */
	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
	ret
END(aesni_xts_mulx)

	.section .rodata
	.align 16
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)

/*
 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_update)
	movdqu	(%rdi),%xmm15
	call	aesni_xts_mulx
	movdqu	%xmm15,(%rsi)
	ret
END(aesni_xts_update)

/*
 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Encrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc1,@function
aesni_enc1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
1:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jz	2f		/* stop if this is the last one */
	aesenc	%xmm8,%xmm0
	jmp	1b
2:	aesenclast %xmm8,%xmm0
	ret
END(aesni_enc1)

/*
 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc8,@function
aesni_enc8:
	movdqa	(%rdi),%xmm8	/* xor in first round key */
	pxor	%xmm8,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
1:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jz	2f		/* stop if this is the last one */
	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
	aesenc	%xmm8,%xmm2
	aesenc	%xmm8,%xmm3
	aesenc	%xmm8,%xmm4
	aesenc	%xmm8,%xmm5
	aesenc	%xmm8,%xmm6
	aesenc	%xmm8,%xmm7
	jmp	1b
2:	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	aesenclast %xmm8,%xmm2
	aesenclast %xmm8,%xmm3
	aesenclast %xmm8,%xmm4
	aesenclast %xmm8,%xmm5
	aesenclast %xmm8,%xmm6
	aesenclast %xmm8,%xmm7
	ret
END(aesni_enc8)

/*
 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Decrypt a single AES block in %xmm0.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec1,@function
aesni_dec1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
1:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jz	2f		/* stop if this is the last one */
	aesdec	%xmm8,%xmm0
	jmp	1b
2:	aesdeclast %xmm8,%xmm0
	ret
END(aesni_dec1)

/*
 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec8,@function
aesni_dec8:
	movdqa	(%rdi),%xmm8	/* xor in first round key */
	pxor	%xmm8,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
1:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jz	2f		/* stop if this is the last one */
	aesdec	%xmm8,%xmm0
	aesdec	%xmm8,%xmm1
	aesdec	%xmm8,%xmm2
	aesdec	%xmm8,%xmm3
	aesdec	%xmm8,%xmm4
	aesdec	%xmm8,%xmm5
	aesdec	%xmm8,%xmm6
	aesdec	%xmm8,%xmm7
	jmp	1b
2:	aesdeclast %xmm8,%xmm0
	aesdeclast %xmm8,%xmm1
	aesdeclast %xmm8,%xmm2
	aesdeclast %xmm8,%xmm3
	aesdeclast %xmm8,%xmm4
	aesdeclast %xmm8,%xmm5
	aesdeclast %xmm8,%xmm6
	aesdeclast %xmm8,%xmm7
	ret
END(aesni_dec8)
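
For readers more at home with intrinsics than with the negated-offset round-key loops above, here is a rough C rendering of aesni_enc1 and aesni_enctodec (an illustrative sketch only; it assumes the key schedule is a plain array of nrounds+1 16-byte round keys, which is how the assembly indexes it):

#include <wmmintrin.h>	/* AES-NI intrinsics; build with -maes */

/*
 * Encrypt one block: xor in round key 0, AESENC with round keys
 * 1..nrounds-1, then AESENCLAST with round key nrounds.
 */
static __m128i
aes_enc1_ref(const __m128i *rk, __m128i block, unsigned nrounds)
{
	unsigned i;

	block = _mm_xor_si128(block, rk[0]);
	for (i = 1; i < nrounds; i++)
		block = _mm_aesenc_si128(block, rk[i]);
	return _mm_aesenclast_si128(block, rk[nrounds]);
}

/*
 * Convert an encryption key schedule into a decryption key schedule
 * for AESDEC: reverse the order and apply AESIMC to every round key
 * except the outermost two, as aesni_enctodec does.
 */
static void
aes_enctodec_ref(const __m128i *erk, __m128i *drk, unsigned nrounds)
{
	unsigned i;

	drk[0] = erk[nrounds];
	for (i = 1; i < nrounds; i++)
		drk[i] = _mm_aesimc_si128(erk[nrounds - i]);
	drk[nrounds] = erk[0];
}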

File Added: src/sys/crypto/aes/arch/x86/files.aesni
#	$NetBSD: files.aesni,v 1.1 2020/06/29 23:29:40 riastradh Exp $

ifdef amd64	# amd64-only for now; i386 left as exercise for reader
file	crypto/aes/arch/x86/aes_ni.c		aes
file	crypto/aes/arch/x86/aes_ni_64.S		aes
endif