Wed Nov 13 13:36:11 2019 UTC
Add patches for the relevant Xen security advisories up to XSA305 (everything
up to XSA297 is already fixed upstream).
Bump PKGREVISION


(bouyer)
diff -r1.8 -r1.9 pkgsrc/sysutils/xenkernel411/Makefile
diff -r1.5 -r1.6 pkgsrc/sysutils/xenkernel411/distinfo
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA298
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA302
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA304
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA305

cvs diff -r1.8 -r1.9 pkgsrc/sysutils/xenkernel411/Attic/Makefile

--- pkgsrc/sysutils/xenkernel411/Attic/Makefile 2019/08/30 13:16:27 1.8
+++ pkgsrc/sysutils/xenkernel411/Attic/Makefile 2019/11/13 13:36:11 1.9
@@ -1,17 +1,17 @@
-# $NetBSD: Makefile,v 1.8 2019/08/30 13:16:27 bouyer Exp $
+# $NetBSD: Makefile,v 1.9 2019/11/13 13:36:11 bouyer Exp $
 
 VERSION=	4.11.2
-#PKGREVISION=	0
+PKGREVISION=	1
 DISTNAME=	xen-${VERSION}
 PKGNAME=	xenkernel411-${VERSION}
 CATEGORIES=	sysutils
 MASTER_SITES=	https://downloads.xenproject.org/release/xen/${VERSION}/
 DIST_SUBDIR=	xen411
 
 MAINTAINER=	bouyer@NetBSD.org
 HOMEPAGE=	https://xenproject.org/
 COMMENT=	Xen 4.11.x Kernel
 
 LICENSE=	gnu-gpl-v2
 
 ONLY_FOR_PLATFORM=	Linux-2.6*-x86_64

cvs diff -r1.5 -r1.6 pkgsrc/sysutils/xenkernel411/Attic/distinfo

--- pkgsrc/sysutils/xenkernel411/Attic/distinfo 2019/08/30 13:16:27 1.5
+++ pkgsrc/sysutils/xenkernel411/Attic/distinfo 2019/11/13 13:36:11 1.6
@@ -1,13 +1,17 @@
-$NetBSD: distinfo,v 1.5 2019/08/30 13:16:27 bouyer Exp $
+$NetBSD: distinfo,v 1.6 2019/11/13 13:36:11 bouyer Exp $
 
 SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
 RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
 SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
 Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6
+SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b
+SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399
+SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
 SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
 SHA1 (patch-xen_arch_x86_boot_build32.mk) = b82c20de9b86ddaa9d05bbc1ff28f970eb78473c
 SHA1 (patch-xen_tools_symbols.c) = 6070b3b5ccc38a196283cfc1c52f5d87858beb18
 SHA1 (patch-zz-bouyer) = bf11b2b81d5c81992c911f670e75dd3aec5ab609

File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA298
$NetBSD: patch-XSA298,v 1.1 2019/11/13 13:36:11 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/PV: check GDT/LDT limits during emulation

Accesses beyond the LDT limit originating from emulation would trigger
the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
accesses would cause an attempt to promote the touched page (offset from
the present LDT base address) to a segment-descriptor page. If this
happens to succeed, guest user mode would be able to elevate its
privileges to those of the guest kernel. This is particularly easy when
there's no LDT at all, in which case the LDT base stored internally to
Xen is simply zero.

Also adjust the ASSERT() that was triggering: it was off by one to begin
with, and for production builds it is better to use ASSERT_UNREACHABLE()
with suitable recovery code afterwards.

This is XSA-298.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- xen/arch/x86/pv/emul-gate-op.c.orig
+++ xen/arch/x86/pv/emul-gate-op.c
@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
     const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel);
 
     if ( (gate_sel < 4) ||
-         ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
+         /*
+          * We're interested in call gates only, which occupy a single
+          * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
+          */
+         ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
+          (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
+                        : v->arch.pv_vcpu.gdt_ents)) ||
          __get_user(desc, pdesc) )
         return 0;
 
@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
     if ( !is_pv_32bit_vcpu(v) )
     {
         if ( (*ar & 0x1f00) != 0x0c00 ||
-             (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+             /* Limit check done above already. */
              __get_user(desc, pdesc + 1) ||
              (desc.b & 0x1f00) )
             return 0;
--- xen/arch/x86/pv/emulate.c.orig
+++ xen/arch/x86/pv/emulate.c
@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
 {
     struct desc_struct desc;
 
-    if ( sel < 4)
+    if ( sel < 4 ||
+         /*
+          * Don't apply the GDT limit here, as the selector may be a Xen
+          * provided one. __get_user() will fail (without taking further
+          * action) for ones falling in the gap between guest populated
+          * and Xen ones.
+          */
+         ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
         desc.b = desc.a = 0;
     else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
         return 0;
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
     BUG_ON(unlikely(in_irq()));
 
     /*
-     * Hardware limit checking should guarantee this property.  NB. This is
+     * Prior limit checking should guarantee this property.  NB. This is
      * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
      * current vcpu, and vcpu_reset() will block until this vcpu has been
      * descheduled before continuing.
      */
-    ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
+    if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
+    {
+        ASSERT_UNREACHABLE();
+        return false;
+    }
 
     if ( is_pv_32bit_domain(currd) )
         linear = (uint32_t)linear;

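All of the hunks above reduce to the same idea: bound a selector's
descriptor index against the vcpu's table limits before the table is
touched. A minimal standalone sketch of that check (the helper and its
surroundings are hypothetical; the real gate check additionally accounts
for 64-bit call gates occupying two consecutive descriptor slots):

/*
 * Bit 2 of a selector (TI) picks LDT vs GDT; bits 15:3 are the
 * descriptor index.  ldt_ents/gdt_ents mirror the arch.pv_vcpu
 * fields used in the patch above.
 */
static bool sel_within_table_limit(unsigned int sel,
                                   unsigned int ldt_ents,
                                   unsigned int gdt_ents)
{
    unsigned int idx   = sel >> 3;              /* descriptor index */
    unsigned int limit = (sel & 4) ? ldt_ents   /* TI=1: LDT */
                                   : gdt_ents;  /* TI=0: GDT */

    return idx < limit;  /* reject out-of-range selectors up front */
}
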
File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA302
$NetBSD: patch-XSA302,v 1.1 2019/11/13 13:36:11 bouyer Exp $

From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Wed, 2 Oct 2019 13:36:59 +0200
Subject: [PATCH 1/2] IOMMU: add missing HVM check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix an unguarded d->arch.hvm access in assign_device().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
---
 xen/drivers/passthrough/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index f51cae7f4e..037aba7c94 100644
--- xen/drivers/passthrough/pci.c.orig
+++ xen/drivers/passthrough/pci.c
@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
     /* Prevent device assign if mem paging or mem sharing have been 
      * enabled for this domain */
     if ( unlikely(!need_iommu(d) &&
-            (d->arch.hvm_domain.mem_sharing_enabled ||
+            ((is_hvm_domain(d) &&
+              d->arch.hvm_domain.mem_sharing_enabled) ||
              vm_event_check_ring(d->vm_event_paging) ||
              p2m_get_hostp2m(d)->global_logdirty)) )
         return -EXDEV;
-- 
2.11.0

From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Mon, 14 Oct 2019 17:52:59 +0100
Subject: [PATCH 2/2] passthrough: quarantine PCI devices

When a PCI device is assigned to an untrusted domain, it is possible for
that domain to program the device to DMA to an arbitrary address. The
IOMMU is used to protect the host from malicious DMA by making sure that
the device addresses can only target memory assigned to the guest. However,
when the guest domain is torn down the device is assigned back to dom0,
thus allowing any in-flight DMA to potentially target critical host data.

This patch introduces a 'quarantine' for PCI devices using dom_io. When
the toolstack makes a device assignable (by binding it to pciback), it
will now also assign it to DOMID_IO and the device will only be assigned
back to dom0 when the device is made unassignable again. Whilst the
device is assignable it will only ever transfer between dom_io and
guest domains.
dom_io is actually only used as a sentinel domain for quarantining purposes;
it is not configured with any IOMMU mappings. Assignment to dom_io simply
means that the device's initiator (requestor) identifier is not present in
the IOMMU's device table and thus any DMA transactions issued will be
terminated with a fault condition.

In addition, a fix to assignment handling is made for VT-d.  Failure
during the assignment step should not lead to a device still being
associated with its prior owner. Hand the device to DomIO temporarily,
until the assignment step has completed successfully.  Remove the PI
hooks from the source domain earlier as well.

Failure of the recovery reassign_device_ownership() must not go
unnoticed: there may, for example, still be leftover RMRR mappings in
the domain whose assignment failed, and hence we can't allow that
domain to continue executing.

NOTE: This patch also includes one printk() cleanup; the
      "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
      since similar printk()-s elsewhere also don't log such a tag.

This is XSA-302.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
---
 tools/libxl/libxl_pci.c                     | 25 +++++++++++-
 xen/arch/x86/mm.c                           |  2 +
 xen/common/domctl.c                         | 14 ++++++-
 xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
 xen/drivers/passthrough/iommu.c             |  9 +++++
 xen/drivers/passthrough/pci.c               | 59 ++++++++++++++++++++++-------
 xen/drivers/passthrough/vtd/iommu.c         | 40 ++++++++++++++++---
 xen/include/xen/pci.h                       |  3 ++
 8 files changed, 138 insertions(+), 24 deletions(-)

diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
index 4755a0c93c..81890a91ac 100644
--- tools/libxl/libxl_pci.c.orig
+++ tools/libxl/libxl_pci.c
@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
                                             libxl_device_pci *pcidev,
                                             int rebind)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
     unsigned dom, bus, dev, func;
     char *spath, *driver_path = NULL;
     int rc;
@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
     }
     if ( rc ) {
         LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
-        return 0;
+        goto quarantine;
     }
 
     /* Check to see if there's already a driver that we need to unbind from */
@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
         return ERROR_FAIL;
     }
 
+quarantine:
+    /*
+     * DOMID_IO is just a sentinel domain, without any actual mappings,
+     * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
+     * unnecessarily denied.
+     */
+    rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
+                          XEN_DOMCTL_DEV_RDM_RELAXED);
+    if ( rc < 0 ) {
+        LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
                                                libxl_device_pci *pcidev,
                                                int rebind)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
     int rc;
     char *driver_path;
 
+    /* De-quarantine */
+    rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
+    if ( rc < 0 ) {
+        LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
+            pcidev->dev, pcidev->func);
+        return ERROR_FAIL;
+    }
+
     /* Unbind from pciback */
     if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
         return ERROR_FAIL;
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e6a4cb28f8..c1ab57f9a5 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
      * Initialise our DOMID_IO domain.
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
+     * Quarantined PCI devices will be associated with this domain.
      */
     dom_io = domain_create(DOMID_IO, NULL);
     BUG_ON(IS_ERR(dom_io));
+    INIT_LIST_HEAD(&dom_io->arch.pdev_list);
 
     /*
      * Initialise our COW domain.
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 9b7bc083ee..741d774cd1 100644
--- xen/common/domctl.c.orig
+++ xen/common/domctl.c
@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 
     switch ( op->cmd )
     {
+    case XEN_DOMCTL_assign_device:
+    case XEN_DOMCTL_deassign_device:
+        if ( op->domain == DOMID_IO )
+        {
+            d = dom_io;
+            break;
+        }
+        else if ( op->domain == DOMID_INVALID )
+            return -ESRCH;
+        /* fall through */
     case XEN_DOMCTL_test_assign_device:
         if ( op->domain == DOMID_INVALID )
         {
@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 
     if ( !domctl_lock_acquire() )
     {
-        if ( d )
+        if ( d && d != dom_io )
             rcu_unlock_domain(d);
         return hypercall_create_continuation(
             __HYPERVISOR_domctl, "h", u_domctl);
@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     domctl_lock_release();
 
  domctl_out_unlock_domonly:
-    if ( d )
+    if ( d && d != dom_io )
         rcu_unlock_domain(d);
 
     if ( copyback && __copy_to_guest(u_domctl, op, 1) )
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 12d2695b89..ec8baae717 100644
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);
 
+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return;
+
     BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
             !iommu->dev_table.buffer );
 
@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
     int req_id;
     u8 bus = pdev->bus;
 
+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return;
+
     BUG_ON ( iommu->dev_table.buffer == NULL );
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
             ivrs_mappings[req_id].read_permission);
     }
 
-    return reassign_device(hardware_domain, d, devfn, pdev);
+    return reassign_device(pdev->domain, d, devfn, pdev);
 }
 
 static void deallocate_next_page_table(struct page_info *pg, int level)
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index 04b0be37d3..8027d96f1c 100644
--- xen/drivers/passthrough/iommu.c.orig
+++ xen/drivers/passthrough/iommu.c
@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
 {
     const struct domain_iommu *hd = dom_iommu(d);
 
+    if ( d == dom_io )
+        return;
+
     d->need_iommu = 0;
     hd->platform_ops->teardown(d);
     tasklet_schedule(&iommu_pt_cleanup_tasklet);
@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
     if ( need_iommu(d) > 0 )
         return 0;
 
+    if ( d == dom_io )
+        return 0;
+
     if ( !iommu_use_hap_pt(d) )
     {
         int rc;
@@ -404,6 +410,9 @@ int __init iommu_setup(void)
     printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
     if ( iommu_enabled )
     {
+        if ( iommu_domain_init(dom_io) )
+            panic("Could not set up quarantine\n");
+
         printk(" - Dom0 mode: %s\n",
                iommu_passthrough ? "Passthrough" :
                iommu_dom0_strict ? "Strict" : "Relaxed");
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 037aba7c94..fb010a547b 100644
--- xen/drivers/passthrough/pci.c.orig
+++ xen/drivers/passthrough/pci.c
@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
     return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
 }
 
-/*
- * If the device isn't owned by the hardware domain, it means it already
- * has been assigned to other domain, or it doesn't exist.
- */
 static int device_assigned(u16 seg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
+    int rc = 0;
 
     pcidevs_lock();
-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
+
+    pdev = pci_get_pdev(seg, bus, devfn);
+
+    if ( !pdev )
+        rc = -ENODEV;
+    /*
+     * If the device exists and it is not owned by either the hardware
+     * domain or dom_io then it must be assigned to a guest, or be
+     * hidden (owned by dom_xen).
+     */
+    else if ( pdev->domain != hardware_domain &&
+              pdev->domain != dom_io )
+        rc = -EBUSY;
+
     pcidevs_unlock();
 
-    return pdev ? 0 : -EBUSY;
+    return rc;
 }
 
 static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
 
     /* Prevent device assign if mem paging or mem sharing have been 
      * enabled for this domain */
-    if ( unlikely(!need_iommu(d) &&
+    if ( d != dom_io &&
+         unlikely(!need_iommu(d) &&
             ((is_hvm_domain(d) &&
               d->arch.hvm_domain.mem_sharing_enabled) ||
              vm_event_check_ring(d->vm_event_paging) ||
@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
         return rc;
     }
 
-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
+    pdev = pci_get_pdev(seg, bus, devfn);
+
+    rc = -ENODEV;
     if ( !pdev )
-    {
-        rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
         goto done;
-    }
+
+    rc = 0;
+    if ( d == pdev->domain )
+        goto done;
+
+    rc = -EBUSY;
+    if ( pdev->domain != hardware_domain &&
+         pdev->domain != dom_io )
+        goto done;
 
     if ( pdev->msix )
         msixtbl_init(d);
@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
     }
 
  done:
+    /* The device is assigned to dom_io so mark it as quarantined */
+    if ( !rc && d == dom_io )
+        pdev->quarantine = true;
+
     if ( !has_arch_pdevs(d) && need_iommu(d) )
         iommu_teardown(d);
     pcidevs_unlock();
@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
 {
     const struct domain_iommu *hd = dom_iommu(d);
     struct pci_dev *pdev = NULL;
+    struct domain *target;
     int ret = 0;
 
     if ( !iommu_enabled || !hd->platform_ops )
@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
     if ( !pdev )
         return -ENODEV;
 
+    /* De-assignment from dom_io should de-quarantine the device */
+    target = (pdev->quarantine && pdev->domain != dom_io) ?
+        dom_io : hardware_domain;
+
     while ( pdev->phantom_stride )
     {
         devfn += pdev->phantom_stride;
         if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
             break;
-        ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+        ret = hd->platform_ops->reassign_device(d, target, devfn,
                                                 pci_to_dev(pdev));
         if ( !ret )
             continue;
@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
     }
 
     devfn = pdev->devfn;
-    ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+    ret = hd->platform_ops->reassign_device(d, target, devfn,
                                             pci_to_dev(pdev));
     if ( ret )
     {
@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
         return ret;
     }
 
+    if ( pdev->domain == hardware_domain  )
+        pdev->quarantine = false;
+
     pdev->fault.count = 0;
 
     if ( !has_arch_pdevs(d) && need_iommu(d) )
@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl(
             ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                                 "h", u_domctl);
         else if ( ret )
-            printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
+            printk(XENLOG_G_ERR
                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                    d->domain_id, ret);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 4c719d4ee7..19f7d13013 100644
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
     int agaw, rc, ret;
     bool_t flush_dev_iotlb;
 
+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return 0;
+
     ASSERT(pcidevs_locked());
     spin_lock(&iommu->lock);
     maddr = bus_to_context_maddr(iommu, bus);
@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
     int iommu_domid, rc, ret;
     bool_t flush_dev_iotlb;
 
+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return 0;
+
     ASSERT(pcidevs_locked());
     spin_lock(&iommu->lock);
 
@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
         goto out;
     }
 
+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        goto out;
+
     /*
      * if no other devices under the same iommu owned by this domain,
      * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
@@ -2389,6 +2401,15 @@ static int reassign_device_ownership(
     if ( ret )
         return ret;
 
+    if ( devfn == pdev->devfn )
+    {
+        list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
+        pdev->domain = dom_io;
+    }
+
+    if ( !has_arch_pdevs(source) )
+        vmx_pi_hooks_deassign(source);
+
     if ( !has_arch_pdevs(target) )
         vmx_pi_hooks_assign(target);
 
@@ -2407,15 +2428,13 @@ static int reassign_device_ownership(
         pdev->domain = target;
     }
 
-    if ( !has_arch_pdevs(source) )
-        vmx_pi_hooks_deassign(source);
-
     return ret;
 }
 
 static int intel_iommu_assign_device(
     struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
 {
+    struct domain *s = pdev->domain;
     struct acpi_rmrr_unit *rmrr;
     int ret = 0, i;
     u16 bdf, seg;
@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device(
         }
     }
 
-    ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
-    if ( ret )
+    ret = reassign_device_ownership(s, d, devfn, pdev);
+    if ( ret || d == dom_io )
         return ret;
 
     /* Setup rmrr identity mapping */
@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device(
             ret = rmrr_identity_mapping(d, 1, rmrr, flag);
             if ( ret )
             {
-                reassign_device_ownership(d, hardware_domain, devfn, pdev);
+                int rc;
+
+                rc = reassign_device_ownership(d, s, devfn, pdev);
                 printk(XENLOG_G_ERR VTDPREFIX
                        " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
                        rmrr->base_address, rmrr->end_address,
                        d->domain_id, ret);
+                if ( rc )
+                {
+                    printk(XENLOG_ERR VTDPREFIX
+                           " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
+                           seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
+                    domain_crash(d);
+                }
                 break;
             }
         }
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 4cfa774615..066364bdef 100644
--- xen/include/xen/pci.h.orig
+++ xen/include/xen/pci.h
@@ -88,6 +88,9 @@ struct pci_dev {
 
     nodeid_t node; /* NUMA node */
 
+    /* Device to be quarantined, don't automatically re-assign to dom0 */
+    bool quarantine;
+
     enum pdev_type {
         DEV_TYPE_PCI_UNKNOWN,
         DEV_TYPE_PCIe_ENDPOINT,
-- 
2.11.0


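The heart of the quarantine above is the choice of where a device lands
on de-assignment. Condensed into a standalone sketch (the helper is
hypothetical; the real code in deassign_device() also walks phantom
functions and clears the quarantine flag once dom0 owns the device
again):

/*
 * A device marked for quarantine is parked on dom_io rather than
 * handed straight back to dom0, so in-flight DMA faults harmlessly
 * against a missing device-table entry.  A second de-assignment,
 * now from dom_io, de-quarantines it.
 */
static struct domain *deassign_target(const struct pci_dev *pdev)
{
    if ( pdev->quarantine && pdev->domain != dom_io )
        return dom_io;           /* park on the sentinel domain */

    return hardware_domain;      /* de-quarantine: return to dom0 */
}
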
File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA304
$NetBSD: patch-XSA304,v 1.1 2019/11/13 13:36:11 bouyer Exp $

From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs

Something causes SandyBridge IOMMUs to choke when EPT pagetables are
shared and an EPT superpage gets shattered.  The root cause is still
under investigation, but the end result is unusable in combination with
the CVE-2018-12207 protections.

This is part of XSA-304 / CVE-2018-12207

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index fb7edfaef9..d698b1d50a 100644
--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
 int __must_check me_wifi_quirk(struct domain *domain,
                                u8 bus, u8 devfn, int map);
 void pci_vtd_quirk(const struct pci_dev *);
+void quirk_iommu_caps(struct iommu *iommu);
+
 bool_t platform_supports_intremap(void);
 bool_t platform_supports_x2apic(void);
 
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index f242e30caf..8712d3b4dc 100644
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
     if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
         return -ENODEV;
 
+    quirk_iommu_caps(iommu);
+
     if ( cap_fault_reg_offset(iommu->cap) +
          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
index d6db862678..b02688e316 100644
--- xen/drivers/passthrough/vtd/quirks.c.orig
+++ xen/drivers/passthrough/vtd/quirks.c
@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
         break;
     }
 }
+
+void __init quirk_iommu_caps(struct iommu *iommu)
+{
+    /*
+     * IOMMU Quirks:
+     *
+     * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
+     * implement superpages internally.
+     *
+     * There are issues changing the walk length under in-flight DMA, which
+     * has manifested as incompatibility between EPT/IOMMU sharing and the
+     * workaround for CVE-2018-12207 / XSA-304.  Hide the superpages
+     * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
+     * and IOMMU pagetables.
+     *
+     * Detection of SandyBridge unfortunately has to be done by processor
+     * model because the client parts don't expose their IOMMUs as PCI devices
+     * we could match with a Device ID.
+     */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 &&
+         (boot_cpu_data.x86_model == 0x2a ||
+          boot_cpu_data.x86_model == 0x2d) )
+        iommu->cap &= ~(0xful << 34);
+}
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtx: Disable executable EPT superpages to work around
 CVE-2018-12207

CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
machine check exception can be generated in a corner case when an executable
mapping changes size or cacheability without TLB invalidation.  HVM guest
kernels can trigger this to DoS the host.

To mitigate, in affected hardware, all EPT superpages are marked NX.  When an
instruction fetch violation is observed against the superpage, the superpage
is shattered to 4k and has execute permissions restored.  This prevents the
guest kernel from being able to create the necessary preconditions in the iTLB
to exploit the vulnerability.

This does come with a workload-dependent performance overhead, caused by
increased TLB pressure.  Performance can be restored, if guest kernels are
trusted not to mount an attack, by specifying ept=exec-sp on the command line.

This is part of XSA-304 / CVE-2018-12207

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index c63a07d29b..684671cb7b 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@ -828,7 +828,7 @@ effect the inverse meaning.
 >> set as UC.
 
 ### ept (Intel)
-> `= List of ( {no-}pml | {no-}ad )`
+> `= List of [ {no-}pml,  {no-}ad, {no-}exec-sp ]`
 
 Controls EPT related features.
 
@@ -851,6 +851,16 @@ Controls EPT related features.
 
 >> Have hardware keep accessed/dirty (A/D) bits updated.
 
+*   The `exec-sp` boolean controls whether EPT superpages with execute
+    permissions are permitted.  In general this is good for performance.
+
+    However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
+    use executable superpages to crash the host.  By default, executable
+    superpages are disabled on affected hardware.
+
+    If HVM guest kernels are trusted not to mount a DoS against the system,
+    this option can be enabled to regain performance.
+
 ### extra\_guest\_irqs
 > `= [<domU number>][,<dom0 number>]`
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f4a6a37149..1924434960 100644
--- xen/arch/x86/hvm/hvm.c.orig
+++ xen/arch/x86/hvm/hvm.c
@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     struct p2m_domain *p2m, *hostp2m;
     int rc, fall_through = 0, paged = 0;
     int sharing_enomem = 0;
+    unsigned int page_order = 0;
     vm_event_request_t *req_ptr = NULL;
     bool_t ap2m_active, sync = 0;
 
@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     hostp2m = p2m_get_hostp2m(currd);
     mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
                               P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
-                              NULL);
+                              &page_order);
 
     if ( ap2m_active )
     {
@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
             goto out;
         }
 
-        mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
+        mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
     }
     else
         p2m = hostp2m;
@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
             break;
         }
 
+        /*
+         * Workaround for XSA-304 / CVE-2018-12207.  If we take an execution
+         * fault against a non-executable superpage, shatter it to regain
+         * execute permissions.
+         */
+        if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
+        {
+            int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
+                                    p2mt, p2ma);
+
+            if ( res )
+                printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
+                       gfn, res);
+
+            rc = !res;
+            goto out_put_gfn;
+        }
+
         if ( violation )
         {
             /* Should #VE be emulated for this fault? */
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 493986e84a..8821a3b536 100644
--- xen/arch/x86/hvm/vmx/vmcs.c.orig
+++ xen/arch/x86/hvm/vmx/vmcs.c
@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
 
 static bool_t __read_mostly opt_pml_enabled = 1;
 static s8 __read_mostly opt_ept_ad = -1;
+int8_t __read_mostly opt_ept_exec_sp = -1;
 
 /*
  * The 'ept' parameter controls functionalities that depend on, or impact the
@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
             opt_pml_enabled = val;
         else if ( !cmdline_strcmp(s, "ad") )
             opt_ept_ad = val;
+        else if ( !cmdline_strcmp(s, "exec-sp") )
+            opt_ept_exec_sp = val;
         else
             rc = -EINVAL;
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 840dc2b44d..a568d62643 100644
--- xen/arch/x86/hvm/vmx/vmx.c.orig
+++ xen/arch/x86/hvm/vmx/vmx.c
@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
 static void __init lbr_tsx_fixup_check(void);
 static void __init bdw_erratum_bdf14_fixup_check(void);
 
+/*
+ * Calculate whether the CPU is vulnerable to Instruction Fetch page
+ * size-change MCEs.
+ */
+static bool __init has_if_pschange_mc(void)
+{
+    uint64_t caps = 0;
+
+    /*
+     * If we are virtualised, there is nothing we can do.  Our EPT tables are
+     * shadowed by our hypervisor, and not walked by hardware.
+     */
+    if ( cpu_has_hypervisor )
+        return false;
+
+    if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+        rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+    if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
+        return false;
+
+    /*
+     * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
+     * this time.
+     */
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+         boot_cpu_data.x86 != 6 )
+        return false;
+
+    switch ( boot_cpu_data.x86_model )
+    {
+        /*
+         * Core processors since at least Nehalem are vulnerable.
+         */
+    case 0x1f: /* Auburndale / Havendale */
+    case 0x1e: /* Nehalem */
+    case 0x1a: /* Nehalem EP */
+    case 0x2e: /* Nehalem EX */
+    case 0x25: /* Westmere */
+    case 0x2c: /* Westmere EP */
+    case 0x2f: /* Westmere EX */
+    case 0x2a: /* SandyBridge */
+    case 0x2d: /* SandyBridge EP/EX */
+    case 0x3a: /* IvyBridge */
+    case 0x3e: /* IvyBridge EP/EX */
+    case 0x3c: /* Haswell */
+    case 0x3f: /* Haswell EX/EP */
+    case 0x45: /* Haswell D */
+    case 0x46: /* Haswell H */
+    case 0x3d: /* Broadwell */
+    case 0x47: /* Broadwell H */
+    case 0x4f: /* Broadwell EP/EX */
+    case 0x56: /* Broadwell D */
+    case 0x4e: /* Skylake M */
+    case 0x5e: /* Skylake D */
+    case 0x55: /* Skylake-X / Cascade Lake */
+    case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
+    case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
+        return true;
+
+        /*
+         * Atom processors are not vulnerable.
+         */
+    case 0x1c: /* Pineview */
+    case 0x26: /* Lincroft */
+    case 0x27: /* Penwell */
+    case 0x35: /* Cloverview */
+    case 0x36: /* Cedarview */
+    case 0x37: /* Baytrail / Valleyview (Silvermont) */
+    case 0x4d: /* Avaton / Rangely (Silvermont) */
+    case 0x4c: /* Cherrytrail / Brasswell */
+    case 0x4a: /* Merrifield */
+    case 0x5a: /* Moorefield */
+    case 0x5c: /* Goldmont */
+    case 0x5d: /* SoFIA 3G Granite/ES2.1 */
+    case 0x65: /* SoFIA LTE AOSP */
+    case 0x5f: /* Denverton */
+    case 0x6e: /* Cougar Mountain */
+    case 0x75: /* Lightning Mountain */
+    case 0x7a: /* Gemini Lake */
+    case 0x86: /* Jacobsville */
+
+        /*
+         * Knights processors are not vulnerable.
+         */
+    case 0x57: /* Knights Landing */
+    case 0x85: /* Knights Mill */
+        return false;
+
+    default:
+        printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
+               boot_cpu_data.x86_model);
+        return true;
+    }
+}
+
 const struct hvm_function_table * __init start_vmx(void)
 {
     set_in_cr4(X86_CR4_VMXE);
@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void)
      */
     if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
     {
+        bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
+
+        if ( opt_ept_exec_sp == -1 )
+        {
+            /* Default to non-executable superpages on vulnerable hardware. */
+            opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
+
+            if ( cpu_has_bug_pschange_mc )
+                printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
+        }
+
         vmx_function_table.hap_supported = 1;
         vmx_function_table.altp2m_supported = 1;
 
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index ce46201d45..93e08f89a2 100644
--- xen/arch/x86/mm/p2m-ept.c.orig
+++ xen/arch/x86/mm/p2m-ept.c
@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
             break;
     }
     
+    /*
+     * Don't create executable superpages if we need to shatter them to
+     * protect against CVE-2018-12207.
+     */
+    if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
+        entry->x = 0;
 }
 
 #define GUEST_TABLE_MAP_FAILED  0
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 89619e4afd..20eb7f6082 100644
--- xen/include/asm-x86/hvm/vmx/vmx.h.orig
+++ xen/include/asm-x86/hvm/vmx/vmx.h
@@ -28,6 +28,8 @@
 #include <asm/hvm/trace.h>
 #include <asm/hvm/vmx/vmcs.h>
 
+extern int8_t opt_ept_exec_sp;
+
 typedef union {
     struct {
         u64 r       :   1,  /* bit 0 - Read permission */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index b8151d2d9f..89ae3e03f1 100644
--- xen/include/asm-x86/msr-index.h.orig
+++ xen/include/asm-x86/msr-index.h
@@ -54,6 +54,7 @@
 #define ARCH_CAPS_SKIP_L1DFL		(_AC(1, ULL) << 3)
 #define ARCH_CAPS_SSB_NO		(_AC(1, ULL) << 4)
 #define ARCH_CAPS_MDS_NO		(_AC(1, ULL) << 5)
+#define ARCH_CAPS_IF_PSCHANGE_MC_NO	(_AC(1, ULL) << 6)
 
 #define MSR_FLUSH_CMD			0x0000010b
 #define FLUSH_CMD_L1D			(_AC(1, ULL) << 0)
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtx: Allow runtime modification of the exec-sp setting

See patch for details.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 684671cb7b..33ed1ffc40 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@ -861,6 +861,21 @@ Controls EPT related features.
     If HVM guest kernels are trusted not to mount a DoS against the system,
     this option can be enabled to regain performance.
 
+    This boolean may be modified at runtime using `xl set-parameters
+    ept=[no-]exec-sp` to switch between fast and secure.
+
+    *   When switching from secure to fast, preexisting HVM domains will run
+        at their current performance until they are rebooted; new domains will
+        run without any overhead.
+
+    *   When switching from fast to secure, all HVM domains will immediately
+        suffer a performance penalty.
+
+    **Warning: No guarantee is made that this runtime option will be retained
+      indefinitely, or that it will retain this exact behaviour.  It is
+      intended as an emergency option for people who first chose fast, then
+      change their minds to secure, and wish not to reboot.**
+
 ### extra\_guest\_irqs
 > `= [<domU number>][,<dom0 number>]`
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8821a3b536..15376e25ba 100644
--- xen/arch/x86/hvm/vmx/vmcs.c.orig
+++ xen/arch/x86/hvm/vmx/vmcs.c
@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
 }
 custom_param("ept", parse_ept_param);
 
+static int parse_ept_param_runtime(const char *s)
+{
+    int val;
+
+    if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
+         !(hvm_funcs.hap_capabilities &
+           (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
+    {
+        printk("VMX: EPT not available, or not in use - ignoring\n");
+        return 0;
+    }
+
+    if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
+        return -EINVAL;
+
+    if ( val != opt_ept_exec_sp )
+    {
+        struct domain *d;
+
+        opt_ept_exec_sp = val;
+
+        rcu_read_lock(&domlist_read_lock);
+        for_each_domain ( d )
+            if ( paging_mode_hap(d) )
+                p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
+        rcu_read_unlock(&domlist_read_lock);
+    }
+
+    printk("VMX: EPT executable superpages %sabled\n",
+           val ? "en" : "dis");
+
+    return 0;
+}
+custom_runtime_only_param("ept", parse_ept_param_runtime);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 2b62bc61dd..97c417fc3e 100644
--- xen/arch/x86/mm/p2m.c.orig
+++ xen/arch/x86/mm/p2m.c
@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
     return 0;
 }
 
+/*
+ * May be called with ot = nt = p2m_ram_rw for its side effect of
+ * recalculating all PTEs in the p2m.
+ */
 void p2m_change_entry_type_global(struct domain *d,
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
-    ASSERT(ot != nt);
     ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
 
     p2m_lock(p2m);
     p2m->change_entry_type_global(p2m, ot, nt);
-    p2m->global_logdirty = (nt == p2m_ram_logdirty);
+    /* Don't allow 'recalculate' operations to change the logdirty state. */
+    if ( ot != nt )
+        p2m->global_logdirty = (nt == p2m_ram_logdirty);
     p2m_unlock(p2m);
 }
 

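Taken together, the XSA-304 pieces above work as follows: on vulnerable
parts, superpages are never installed executable, so an executable
translation can never change size under the iTLB; an instruction fetch
that faults on such a superpage is answered by shattering it to 4k,
which restores execute permission at the smaller size only. The
mapping-time half is just the two lines from the ept_p2m_type_to_flags()
hunk above, shown here out of context as a sketch:

/*
 * While exec-sp is off, clear the execute bit on every superpage EPT
 * entry at creation time.  Execute permission then only ever exists
 * on 4k mappings; instruction fetches on superpages fault and reach
 * the shatter path added to hvm_hap_nested_page_fault() above.
 */
if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
    entry->x = 0;
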
File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA305
$NetBSD: patch-XSA305,v 1.1 2019/11/13 13:36:11 bouyer Exp $

From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available

To protect against the TSX Async Abort speculative vulnerability, Intel have
released new microcode for affected parts which introduce the MSR_TSX_CTRL
control, which allows TSX to be turned off.  This will be architectural on
future parts.

Introduce tsx= to provide a global on/off for TSX, including its enumeration
via CPUID.  Provide stub virtualisation of this MSR, as it is not exposed to
guests at the moment.

VMs may have booted before microcode is loaded, or before hosts have rebooted,
and they still want to migrate freely.  A VM which booted seeing TSX can
migrate safely to hosts with TSX disabled - TSX will start unconditionally
aborting, but still behave in a manner compatible with the ABI.

The guest-visible behaviour is equivalent to late loading the microcode and
setting the RTM_DISABLE bit in the course of live patching.

This is part of XSA-305 / CVE-2019-11135

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 684671cb7b..b86d26399a 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter.
 ### tsc (x86)
 > `= unstable | skewed | stable:socket`
 
+### tsx
+    = <bool>
+
+    Applicability: x86
+    Default: true
+
+Controls for the use of Transactional Synchronization eXtensions.
+
+On Intel parts released in Q3 2019 (with updated microcode), and future parts,
+a control has been introduced which allows TSX to be turned off.
+
+On systems with the ability to turn TSX off, this boolean offers system wide
+control of whether TSX is enabled or disabled.
+
 ### ucode (x86)
 > `= [<integer> | scan]`
 
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index da1e4827f4..4c82d9f710 100644
--- xen/arch/x86/Makefile.orig
+++ xen/arch/x86/Makefile
@@ -65,6 +65,7 @@ obj-y += sysctl.o
 obj-y += time.o
 obj-y += trace.o
 obj-y += traps.o
+obj-y += tsx.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
 obj-$(CONFIG_TBOOT) += tboot.o
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 5e11970701..04aefa555d 100644
--- xen/arch/x86/cpuid.c.orig
+++ xen/arch/x86/cpuid.c
@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d)
     if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
         __set_bit(X86_FEATURE_ITSC, max_fs);
 
+    /*
+     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
+     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
+     * pre-mitigation when the TSX features were visible.
+     *
+     * This situation is compatible (albeit with a perf hit to any TSX code in
+     * the guest), so allow the feature bits to remain set.
+     */
+    if ( cpu_has_tsx_ctrl )
+    {
+        __set_bit(X86_FEATURE_HLE, max_fs);
+        __set_bit(X86_FEATURE_RTM, max_fs);
+    }
+
     /* Clamp the toolstacks choices to reality. */
     for ( i = 0; i < ARRAY_SIZE(fs); i++ )
         fs[i] &= max_fs[i];
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index ebc0665615..35d99a98a1 100644
--- xen/arch/x86/msr.c.orig
+++ xen/arch/x86/msr.c
@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
     case MSR_FLUSH_CMD:
         /* Write-only */
     case MSR_TSX_FORCE_ABORT:
+    case MSR_TSX_CTRL:
         /* Not offered to guests. */
         goto gp_fault;
 
@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
     case MSR_ARCH_CAPABILITIES:
         /* Read-only */
     case MSR_TSX_FORCE_ABORT:
+    case MSR_TSX_CTRL:
         /* Not offered to guests. */
         goto gp_fault;
 
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 657160549f..dc13ad6c36 100644
--- xen/arch/x86/setup.c.orig
+++ xen/arch/x86/setup.c
@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     early_microcode_init();
 
+    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
+
     identify_cpu(&boot_cpu_data);
 
     set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index fd52a10cf9..bdc118d88b 100644
--- xen/arch/x86/smpboot.c.orig
+++ xen/arch/x86/smpboot.c
@@ -376,6 +376,8 @@ void start_secondary(void *unused)
     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
 
+    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
+
     if ( xen_guest )
         hypervisor_ap_setup();
 
diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
new file mode 100644
index 0000000000..a8ec2ccc69
--- /dev/null
+++ xen/arch/x86/tsx.c
@@ -0,0 +1,74 @@
+#include <xen/init.h>
+#include <asm/msr.h>
+
+/*
+ * Valid values:
+ *   1 => Explicit tsx=1
+ *   0 => Explicit tsx=0
+ *  -1 => Default, implicit tsx=1
+ *
+ * This is arranged such that the bottom bit encodes whether TSX is actually
+ * disabled, while identifying various explicit (>=0) and implicit (<0)
+ * conditions.
+ */
+int8_t __read_mostly opt_tsx = -1;
+int8_t __read_mostly cpu_has_tsx_ctrl = -1;
+
+static int __init parse_tsx(const char *s)
+{
+    int rc = 0, val = parse_bool(s, NULL);
+
+    if ( val >= 0 )
+        opt_tsx = val;
+    else
+        rc = -EINVAL;
+
+    return rc;
+}
+custom_param("tsx", parse_tsx);
+
+void tsx_init(void)
+{
+    /*
+     * This function is first called between microcode being loaded, and CPUID
+     * being scanned generally.  Calculate from raw data whether MSR_TSX_CTRL
+     * is available.
+     */
+    if ( unlikely(cpu_has_tsx_ctrl < 0) )
+    {
+        uint64_t caps = 0;
+
+        if ( boot_cpu_data.cpuid_level >= 7 &&
+             (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
+            rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+        cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
+    }
+
+    if ( cpu_has_tsx_ctrl )
+    {
+        uint64_t val;
+
+        rdmsrl(MSR_TSX_CTRL, val);
+
+        val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
+        /* Check bottom bit only.  Higher bits are various sentinels. */
+        if ( !(opt_tsx & 1) )
+            val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
+
+        wrmsrl(MSR_TSX_CTRL, val);
+    }
+    else if ( opt_tsx >= 0 )
+        printk_once(XENLOG_WARNING
+                    "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 89ae3e03f1..5ee7a37c12 100644
--- xen/include/asm-x86/msr-index.h.orig
+++ xen/include/asm-x86/msr-index.h
@@ -55,6 +55,7 @@
 #define ARCH_CAPS_SSB_NO		(_AC(1, ULL) << 4)
 #define ARCH_CAPS_MDS_NO		(_AC(1, ULL) << 5)
 #define ARCH_CAPS_IF_PSCHANGE_MC_NO	(_AC(1, ULL) << 6)
+#define ARCH_CAPS_TSX_CTRL		(_AC(1, ULL) << 7)
 
 #define MSR_FLUSH_CMD			0x0000010b
 #define FLUSH_CMD_L1D			(_AC(1, ULL) << 0)
@@ -62,6 +63,10 @@
 #define MSR_TSX_FORCE_ABORT             0x0000010f
 #define TSX_FORCE_ABORT_RTM             (_AC(1, ULL) <<  0)
 
+#define MSR_TSX_CTRL                    0x00000122
+#define TSX_CTRL_RTM_DISABLE            (_AC(1, ULL) <<  0)
+#define TSX_CTRL_CPUID_CLEAR            (_AC(1, ULL) <<  1)
+
 /* Intel MSRs. Some also available on other CPUs */
 #define MSR_IA32_PERFCTR0		0x000000c1
 #define MSR_IA32_A_PERFCTR0		0x000004c1
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 20d1ecb332..66224f23b9 100644
--- xen/include/asm-x86/processor.h.orig
+++ xen/include/asm-x86/processor.h
@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx(
     return ebx;
 }
 
+static always_inline unsigned int cpuid_count_edx(
+    unsigned int leaf, unsigned int subleaf)
+{
+    unsigned int edx, tmp;
+
+    cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
+
+    return edx;
+}
+
 static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
                                            struct cpuid_leaf *data)
 {
@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
     return fam;
 }
 
+extern int8_t opt_tsx, cpu_has_tsx_ctrl;
+void tsx_init(void);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_X86_PROCESSOR_H */
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
index 750f809968..be223a6950 100644
--- xen/include/xen/lib.h.orig
+++ xen/include/xen/lib.h
@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
 #define gprintk(lvl, fmt, args...) \
     printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
 
+#define printk_once(fmt, args...)               \
+({                                              \
+    static bool __read_mostly once_;            \
+    if ( unlikely(!once_) )                     \
+    {                                           \
+        once_ = true;                           \
+        printk(fmt, ## args);                   \
+    }                                           \
+})
+
 #ifdef NDEBUG
 
 static inline void
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel

See patch documentation and comments.

This is part of XSA-305 / CVE-2019-11135

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index b86d26399a..31635a473a 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@ -1841,7 +1841,7 @@ extreme care.**
 An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
 mitigations, including pieces of infrastructure used to virtualise certain
 mitigation features for guests.  This also includes settings which `xpti`,
-`smt`, `pv-l1tf` control, unless the respective option(s) have been
+`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
 specified earlier on the command line.
 
 Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter.
     = <bool>
 
     Applicability: x86
-    Default: true
+    Default: false on parts vulnerable to TAA, true otherwise
 
 Controls for the use of Transactional Synchronization eXtensions.
 
@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off.
 On systems with the ability to turn TSX off, this boolean offers system wide
 control of whether TSX is enabled or disabled.
 
+On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
+logic applies:
+
+ * An explicit `tsx=` choice is honoured, even if it is `true` and would
+   result in a vulnerable system.
+
+ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
+   mitigated by disabling TSX, as this is the lowest overhead option.
+
+ * If the use of TSX is important, the more expensive TAA mitigations can be
+   opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
+   active by default.
+
 ### ucode (x86)
 > `= [<integer> | scan]`
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index 2fe16b423d..ab196b156d 100644
--- xen/arch/x86/spec_ctrl.c.orig
+++ xen/arch/x86/spec_ctrl.c
@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s)
             if ( opt_pv_l1tf_domu < 0 )
                 opt_pv_l1tf_domu = 0;
 
+            if ( opt_tsx == -1 )
+                opt_tsx = -3;
+
         disable_common:
             opt_rsb_pv = false;
             opt_rsb_hvm = false;
@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
     printk("Speculative mitigation facilities:\n");
 
     /* Hardware features which pertain to speculative mitigations. */
-    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
+    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP"     : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (caps & ARCH_CAPS_RSBA)                  ? " RSBA"      : "",
            (caps & ARCH_CAPS_SKIP_L1DFL)            ? " SKIP_L1DFL": "",
            (caps & ARCH_CAPS_SSB_NO)                ? " SSB_NO"    : "",
-           (caps & ARCH_CAPS_MDS_NO)                ? " MDS_NO"    : "");
+           (caps & ARCH_CAPS_MDS_NO)                ? " MDS_NO"    : "",
+           (caps & ARCH_CAPS_TSX_CTRL)              ? " TSX_CTRL"  : "",
+           (caps & ARCH_CAPS_TAA_NO)                ? " TAA_NO"    : "");
 
     /* Compiled-in support which pertains to mitigations. */
     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
                "\n");
 
     /* Settings for Xen's protection, irrespective of guests. */
-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
+    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
            thunk == THUNK_NONE      ? "N/A" :
            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
            thunk == THUNK_LFENCE    ? "LFENCE" :
@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (default_xen_spec_ctrl & SPEC_CTRL_IBRS)  ? "IBRS+" :  "IBRS-",
            !boot_cpu_has(X86_FEATURE_SSBD)           ? "" :
            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
+           !(caps & ARCH_CAPS_TSX_CTRL)              ? "" :
+           (opt_tsx & 1)                             ? " TSX+" : " TSX-",
            opt_ibpb                                  ? " IBPB"  : "",
            opt_l1d_flush                             ? " L1D_FLUSH" : "",
            opt_md_clear_pv || opt_md_clear_hvm       ? " VERW"  : "");
@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void)
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
     bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
+    bool cpu_has_bug_taa;
     uint64_t caps = 0;
 
     if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void)
             "enabled.  Mitigations will not be fully effective.  Please\n"
             "choose an explicit smt=<bool> setting.  See XSA-297.\n");
 
+    /*
+     * Vulnerability to TAA is a little complicated to quantify.
+     *
+     * In the pipeline, it is just another way to get speculative access to
+     * stale load port, store buffer or fill buffer data, and therefore can be
+     * considered a superset of MDS (on TSX-capable parts).  On parts which
+     * predate MDS_NO, the existing VERW flushing will mitigate this
+     * sidechannel as well.
+     *
+     * On parts which contain MDS_NO, the lack of VERW flushing means that an
+     * attacker can still use TSX to target microarchitectural buffers to leak
+     * secrets.  Therefore, we consider TAA to be the set of TSX-capable parts
+     * which have MDS_NO but lack TAA_NO.
+     *
+     * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
+     *       cmdline.  MSR_TSX_CTRL will only appear on TSX-capable parts, so
+     *       we check both to spot TSX in a microcode/cmdline independent way.
+     */
+    cpu_has_bug_taa =
+        (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
+        (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
+
+    /*
+     * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
+     * the MDS mitigation of disabling HT and using VERW flushing.
+     *
+     * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
+     * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
+     * advertised, and there isn't a MD_CLEAR_2 flag to use...
+     *
+     * If we're on affected hardware, able to do something about it (which
+     * implies that VERW now works), no explicit TSX choice and traditional
+     * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
+     * plausibly value TSX higher than Hyperthreading...), disable TSX to
+     * mitigate TAA.
+     */
+    if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
+         ((hw_smt_enabled && opt_smt) ||
+          !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
+    {
+        setup_clear_cpu_cap(X86_FEATURE_HLE);
+        setup_clear_cpu_cap(X86_FEATURE_RTM);
+
+        opt_tsx = 0;
+        tsx_init();
+    }
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
index a8ec2ccc69..2d202a0d4e 100644
--- xen/arch/x86/tsx.c.orig
+++ xen/arch/x86/tsx.c
@@ -5,7 +5,8 @@
  * Valid values:
  *   1 => Explicit tsx=1
  *   0 => Explicit tsx=0
- *  -1 => Default, implicit tsx=1
+ *  -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
+ *  -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
  *
  * This is arranged such that the bottom bit encodes whether TSX is actually
  * disabled, while identifying various explicit (>=0) and implicit (<0)
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 5ee7a37c12..1761a01f1f 100644
--- xen/include/asm-x86/msr-index.h.orig
+++ xen/include/asm-x86/msr-index.h
@@ -56,6 +56,7 @@
 #define ARCH_CAPS_MDS_NO		(_AC(1, ULL) << 5)
 #define ARCH_CAPS_IF_PSCHANGE_MC_NO	(_AC(1, ULL) << 6)
 #define ARCH_CAPS_TSX_CTRL		(_AC(1, ULL) << 7)
+#define ARCH_CAPS_TAA_NO		(_AC(1, ULL) << 8)
 
 #define MSR_FLUSH_CMD			0x0000010b
 #define FLUSH_CMD_L1D			(_AC(1, ULL) << 0)
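
Putting the two XSA-305 patches together: a part needs the TSX
mitigation exactly when it is TSX-capable yet relies on MDS_NO rather
than VERW flushing. A condensed sketch of that decision (simplified
from init_speculation_mitigations() above; the real code additionally
checks whether the no-SMT/VERW MDS mitigations are already in effect,
and also clears the HLE/RTM feature bits):

/* TSX-capable (RTM visible, or MSR_TSX_CTRL enumerated - TSX may
 * already be hidden by tsx=0) with MDS_NO but without TAA_NO. */
bool cpu_has_bug_taa =
    (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
    (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;

if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) )
{
    opt_tsx = 0;
    tsx_init();   /* sets TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR */
}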