Wed Oct 21 09:03:05 2020 UTC
Add upstream security patches for XSA286, XSA345, XSA346, XSA347.
Bump PKGREVISION.


(bouyer)
diff -r1.16 -r1.17 pkgsrc/sysutils/xenkernel411/Makefile
diff -r1.14 -r1.15 pkgsrc/sysutils/xenkernel411/distinfo
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA286
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA345
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA346
diff -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA347

cvs diff -r1.16 -r1.17 pkgsrc/sysutils/xenkernel411/Attic/Makefile

--- pkgsrc/sysutils/xenkernel411/Attic/Makefile 2020/10/02 13:00:48 1.16
+++ pkgsrc/sysutils/xenkernel411/Attic/Makefile 2020/10/21 09:03:05 1.17
@@ -1,18 +1,18 @@
-# $NetBSD: Makefile,v 1.16 2020/10/02 13:00:48 bouyer Exp $
+# $NetBSD: Makefile,v 1.17 2020/10/21 09:03:05 bouyer Exp $
 
 VERSION= 4.11.4
 #keep >= 1 if we have security patches
-PKGREVISION= 2
+PKGREVISION= 3
 DISTNAME= xen-${VERSION}
 PKGNAME= xenkernel411-${VERSION}
 CATEGORIES= sysutils
 MASTER_SITES= https://downloads.xenproject.org/release/xen/${VERSION}/
 DIST_SUBDIR= xen411
 
 MAINTAINER= bouyer@NetBSD.org
 HOMEPAGE= https://xenproject.org/
 COMMENT= Xen 4.11.x Kernel
 
 LICENSE= gnu-gpl-v2
 
 ONLY_FOR_PLATFORM= Linux-2.6*-x86_64

cvs diff -r1.14 -r1.15 pkgsrc/sysutils/xenkernel411/Attic/distinfo

--- pkgsrc/sysutils/xenkernel411/Attic/distinfo 2020/10/02 13:00:48 1.14
+++ pkgsrc/sysutils/xenkernel411/Attic/distinfo 2020/10/21 09:03:05 1.15
@@ -1,27 +1,31 @@
-$NetBSD: distinfo,v 1.14 2020/10/02 13:00:48 bouyer Exp $
+$NetBSD: distinfo,v 1.15 2020/10/21 09:03:05 bouyer Exp $
 
 SHA1 (xen411/xen-4.11.4.tar.gz) = 6c8cdf441621c14dc5345196b48df6982c060c4f
 RMD160 (xen411/xen-4.11.4.tar.gz) = 49819fcd1de3985d4dea370be962548c862f2933
 SHA512 (xen411/xen-4.11.4.tar.gz) = 8383f0b369fa08c8ecfdd68f902a2aaad140146a183131c50c020fe04c2f1e829c219b9bd9923fa8f1c180e1e7c6e73d0d68b7015fc39fd3b7f59e55c680cedb
 Size (xen411/xen-4.11.4.tar.gz) = 25184564 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA286) = c7c5cc192be821721919cc035515ddf55d2c0658
 SHA1 (patch-XSA317) = 3a3e7bf8f115bebaf56001afcf68c2bd501c00a5
 SHA1 (patch-XSA319) = 4954bdc849666e1c735c3281256e4850c0594ee8
 SHA1 (patch-XSA320) = 38d84a2ded4ccacee455ba64eb3b369e5661fbfd
 SHA1 (patch-XSA321) = 1f15b2e3c0f7f2d7335879d3a83c1557ac9de806
 SHA1 (patch-XSA328) = a9b02c183a5dbfb6c0fe50824f18896fcab4a9e9
 SHA1 (patch-XSA333) = 47660b70b2c998436587600bb9a25c2f494afa49
 SHA1 (patch-XSA336) = da0a8bb05877917c75a28155cf2dd2f66d11ef9c
 SHA1 (patch-XSA337) = f323b4c596f8a7b2b3d57dd799f70cf62743369f
 SHA1 (patch-XSA338) = 0adcebec2c25a389155a10de84bf999ff2e5425d
 SHA1 (patch-XSA339) = 4f97076bda8150d1b1c68f6000d563f3c3314c02
 SHA1 (patch-XSA340) = 23888acfe25fc82ff085fa9acfbb36c156a15bc3
 SHA1 (patch-XSA342) = a61c4e28a8c8219b88e3bab534a109b2b29e2cc3
 SHA1 (patch-XSA343) = 239822636b474ebb62aa455cfdbd9853c4fb342f
 SHA1 (patch-XSA344) = cf7184ac9263b418305c6a7fbae7b163b233b4bc
+SHA1 (patch-XSA345) = 14ab754703af1045b2d049de1c6ba1c5baca5d81
+SHA1 (patch-XSA346) = c1962c037c5ab62c2f7e9a558c4565331c981be0
+SHA1 (patch-XSA347) = f3f98a794584d5d4321b95c2b1b9c88821fa567e
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
 SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
 SHA1 (patch-xen_arch_x86_boot_build32.mk) = b82c20de9b86ddaa9d05bbc1ff28f970eb78473c
 SHA1 (patch-xen_tools_symbols.c) = 6070b3b5ccc38a196283cfc1c52f5d87858beb18
 SHA1 (patch-zz-bouyer) = bf11b2b81d5c81992c911f670e75dd3aec5ab609

File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA286
$NetBSD: patch-XSA286,v 1.1 2020/10/21 09:03:05 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV

The flag is really only meant for those; both HVM and 32-bit PV tell
kernel from user mode based on CPL/RPL. Remove the all-question-marks
comment, and to be on the safe side also suppress clearing the flag
for 32-bit PV (this isn't a fast path after all).

Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
assumes the new behavior.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 35857dbe86..1d0ac81c5b 100644
--- xen/arch/x86/domain.c.orig
+++ xen/arch/x86/domain.c
@@ -804,9 +804,15 @@ int arch_set_info_guest(
 
     v->fpu_initialised = !!(flags & VGCF_I387_VALID);
 
-    v->arch.flags &= ~TF_kernel_mode;
-    if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
-        v->arch.flags |= TF_kernel_mode;
+    v->arch.flags |= TF_kernel_mode;
+    if ( unlikely(!(flags & VGCF_in_kernel)) &&
+         /*
+          * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+          * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+          * shadow_one_bit_disable() for why that is.
+          */
+         !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+        v->arch.flags &= ~TF_kernel_mode;
 
     v->arch.vgc_flags = flags;
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 8ab343d16e..a2ebb4943f 100644
--- xen/arch/x86/mm/shadow/multi.c.orig
+++ xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                              INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
-    mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+    mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
                       ? pagetable_get_mfn(v->arch.guest_table)
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
@@ -4018,7 +4018,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
                   v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 
 #if GUEST_PAGING_LEVELS == 4
-    if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+    if ( !(v->arch.flags & TF_kernel_mode) )
         gmfn = pagetable_get_mfn(v->arch.guest_table_user);
     else
 #endif
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one at least is going to be re-used by a subsequent patch, and
splitting the L4 one then as well seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 3bd157967a..e73daa55e4 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
 
 l2_pgentry_t *compat_idle_pg_table_l2;
 
-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn = pagetable_get_pfn(root);
+    l4_pgentry_t *l4t, l4e;
 
-    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
-        return NULL;
+    if ( !is_canonical_address(addr) )
+        return l4e_empty();
 
     l4t = map_domain_page(_mfn(mfn));
     l4e = l4t[l4_table_offset(addr)];
     unmap_domain_page(l4t);
+
+    return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+    l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+    l3_pgentry_t *l3t, l3e;
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return NULL;
+        return l3e_empty();
 
     l3t = map_l3t_from_l4e(l4e);
     l3e = l3t[l3_table_offset(addr)];
     unmap_domain_page(l3t);
+
+    return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+    l3_pgentry_t l3e;
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned long mfn;
+
+    if ( !is_pv_vcpu(v) )
+        return NULL;
+
+    l3e = page_walk_get_l3e(v->arch.guest_table, addr);
     mfn = l3e_get_pfn(l3e);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
         return NULL;
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entries read to be guaranteed valid, transiently locking
the pages and validating their types is necessary. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
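
Each level of the reworked walk follows the same pattern: take the page
lock on the page backing the table, check via its type_info that it really
is a page table of the expected level, and only then map it and read the
entry. As a condensed sketch of one such step, using the Xen-internal
helpers visible in the hunks below (not standalone code), the L3 step
looks roughly like:

    l3_pgentry_t l3e = l3e_empty();
    struct page_info *pg = mfn_to_page(mfn);   /* mfn taken from the L4 entry */

    if ( !page_lock(pg) )                      /* transiently lock the page */
        return l3e_empty();

    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
    {
        l3_pgentry_t *l3t = map_domain_page(mfn);   /* only read validated tables */

        l3e = l3t[l3_table_offset(addr)];
        unmap_domain_page(l3t);
    }

    page_unlock(pg);                           /* a non-L3 page yields l3e_empty() */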

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index e73daa55e4..1ca9547d68 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
 
 static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 {
-    unsigned long mfn = pagetable_get_pfn(root);
-    l4_pgentry_t *l4t, l4e;
+    mfn_t mfn = pagetable_get_mfn(root);
+    /* current's root page table can't disappear under our feet. */
+    bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+    struct page_info *pg;
+    l4_pgentry_t l4e = l4e_empty();
 
     if ( !is_canonical_address(addr) )
         return l4e_empty();
 
-    l4t = map_domain_page(_mfn(mfn));
-    l4e = l4t[l4_table_offset(addr)];
-    unmap_domain_page(l4t);
+    pg = mfn_to_page(mfn);
+    if ( need_lock && !page_lock(pg) )
+        return l4e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+    {
+        l4_pgentry_t *l4t = map_domain_page(mfn);
+
+        l4e = l4t[l4_table_offset(addr)];
+        unmap_domain_page(l4t);
+    }
+
+    if ( need_lock )
+        page_unlock(pg);
 
     return l4e;
 }
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
 static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 {
     l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
-    l3_pgentry_t *l3t, l3e;
+    mfn_t mfn = l4e_get_mfn(l4e);
+    struct page_info *pg;
+    l3_pgentry_t l3e = l3e_empty();
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return l3e_empty();
 
-    l3t = map_l3t_from_l4e(l4e);
-    l3e = l3t[l3_table_offset(addr)];
-    unmap_domain_page(l3t);
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l3e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+    {
+        l3_pgentry_t *l3t = map_domain_page(mfn);
+
+        l3e = l3t[l3_table_offset(addr)];
+        unmap_domain_page(l3t);
+    }
+
+    page_unlock(pg);
 
     return l3e;
 }
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-    unsigned long mfn;
+    l2_pgentry_t l2e = l2e_empty();
+    l1_pgentry_t l1e = l1e_empty();
+    mfn_t mfn;
+    struct page_info *pg;
 
     if ( !is_pv_vcpu(v) )
         return NULL;
 
     l3e = page_walk_get_l3e(v->arch.guest_table, addr);
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    mfn = l3e_get_mfn(l3e);
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l2t = map_domain_page(_mfn(mfn));
-    l2e = l2t[l2_table_offset(addr)];
-    unmap_domain_page(l2t);
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l2e_get_mfn(l2e);
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+        mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
         goto ret;
     }
 
-    l1t = map_domain_page(_mfn(mfn));
-    l1e = l1t[l1_table_offset(addr)];
-    unmap_domain_page(l1t);
-    mfn = l1e_get_pfn(l1e);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return NULL;
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    mfn = l1e_get_mfn(l1e);
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
         return NULL;
 
  ret:
-    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+    return map_domain_page(mfn) + (addr & ~PAGE_MASK);
 }
 
 /*
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 80bf280fb2..ee08c13881 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
     if ( unlikely(!__addr_ok(linear)) )
         return NULL;
 
-    /* Find this l1e and its enclosing l1mfn in the linear map. */
-    if ( __copy_from_user(&l2e,
-                          &__linear_l2_table[l2_linear_offset(linear)],
-                          sizeof(l2_pgentry_t)) )
+    if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+    {
+        ASSERT_UNREACHABLE();
         return NULL;
+    }
+
+    /* Find this l1e and its enclosing l1mfn. */
+    l2e = page_walk_get_l2e(current->arch.guest_table, linear);
 
     /* Check flags that it will be safe to read the l1e. */
     if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 1ca9547d68..dfa33ba894 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
     return l3e;
 }
 
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+    l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+    mfn_t mfn = l3e_get_mfn(l3e);
+    struct page_info *pg;
+    l2_pgentry_t l2e = l2e_empty();
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+         (l3e_get_flags(l3e) & _PAGE_PSE) )
+        return l2e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l2e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+    {
+        l2_pgentry_t *l2t = map_domain_page(mfn);
+
+        l2e = l2t[l2_table_offset(addr)];
+        unmap_domain_page(l2t);
+    }
+
+    page_unlock(pg);
+
+    return l2e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7825691d06..afafe87fe7 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -585,7 +585,9 @@ void audit_domains(void);
 void make_cr3(struct vcpu *v, mfn_t mfn);
 void update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
+
 void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index ee08c13881..c70785d0cf 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
 }
 
 /*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
-    struct vcpu *curr = current;
-    const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
-    l1_pgentry_t l1e;
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    l1e = guest_get_eff_l1e(linear);
-
-    if ( user_mode )
-        toggle_guest_pt(curr);
-
-    return l1e;
-}
-
-/*
  * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
  * into Xen's virtual range.  Returns true if the mapping changed, false
  * otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- xen/arch/x86/pv/mm.h.orig
+++ xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
 
-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
 {
-    l1_pgentry_t l1e;
+    l1_pgentry_t l1e = l1e_empty();
 
     ASSERT(!paging_mode_translate(current->domain));
     ASSERT(!paging_mode_external(current->domain));
 
-    if ( unlikely(!__addr_ok(linear)) ||
-         __copy_from_user(&l1e,
-                          &__linear_l1_table[l1_linear_offset(linear)],
-                          sizeof(l1_pgentry_t)) )
-        l1e = l1e_empty();
+    if ( likely(__addr_ok(linear)) )
+        l1e = page_walk_get_l1e(current->arch.guest_table, linear);
 
     return l1e;
 }
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index a3c0c2dd19..c9ee5156f8 100644
--- xen/arch/x86/pv/ro-page-fault.c.orig
+++ xen/arch/x86/pv/ro-page-fault.c
@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     bool mmio_ro;
 
     /* Attempt to read the PTE that maps the VA being accessed. */
-    pte = guest_get_eff_l1e(addr);
+    pte = guest_get_eff_kern_l1e(addr);
 
     /* We are only looking for read-only mappings */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index dfa33ba894..cca7ea6e9d 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
     return l2e;
 }
 
+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+    l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+    mfn_t mfn = l2e_get_mfn(l2e);
+    struct page_info *pg;
+    l1_pgentry_t l1e = l1e_empty();
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+         (l2e_get_flags(l2e) & _PAGE_PSE) )
+        return l1e_empty();
+
+    pg = mfn_to_page(mfn);
+    if ( !page_lock(pg) )
+        return l1e_empty();
+
+    if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+    {
+        l1_pgentry_t *l1t = map_domain_page(mfn);
+
+        l1e = l1t[l1_table_offset(addr)];
+
+        if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+             _PAGE_PRESENT )
+        {
+            l1_pgentry_t ol1e = l1e;
+
+            l1e_add_flags(l1e, _PAGE_ACCESSED);
+            /*
+             * Best effort only; with the lock held the page shouldn't
+             * change anyway, except for the dirty bit to perhaps become set.
+             */
+            while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+                            l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+                    l1e_get_intpte(ol1e) &&
+                    !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+            {
+                l1e_add_flags(ol1e, _PAGE_DIRTY);
+                l1e_add_flags(l1e, _PAGE_DIRTY);
+            }
+        }
+
+        unmap_domain_page(l1t);
+    }
+
+    page_unlock(pg);
+
+    return l1e;
+}
+
 void *do_page_walk(struct vcpu *v, unsigned long addr)
 {
     l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index afafe87fe7..423313ae3a 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
 
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
 
 int __sync_local_execstate(void);
 
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
 {,un}map_domain_page()

Move the page table recursion two levels down. This entails taking care
not to free the recursive mapping prematurely in
free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 0c24530ed9..d89fa27f8e 100644
--- xen/arch/x86/domain_page.c.orig
+++ xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
 #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
 #define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+                      ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
 
 void *map_domain_page(mfn_t mfn)
 {
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
 {
     struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
     unsigned int bitmap_pages;
+    int rc;
 
     ASSERT(is_pv_domain(d));
 
@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
         return 0;
 #endif
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
     BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+                 (1U << L2_PAGETABLE_SHIFT) >
                  MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
     bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
     dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
 
     spin_lock_init(&dcache->lock);
 
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+                                  2 * bitmap_pages + 1,
+                                  NIL(l1_pgentry_t *), NULL);
+    if ( !rc )
+    {
+        /*
+         * Install mapping of our L2 table into its own last slot, for easy
+         * access to the L1 entries via MAPCACHE_L1ENT().
+         */
+        l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+        l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+        l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+        l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+        unmap_domain_page(l2t);
+        unmap_domain_page(l3t);
+    }
+
+    return rc;
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     else
     {
         ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+        pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
     }
 
     return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 626768a950..8f975a747d 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -6038,6 +6038,10 @@ void free_perdomain_mappings(struct domain *d)
                 {
                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
 
+                    /* mapcache_domain_init() installs a recursive entry. */
+                    if ( l1pg == l2pg )
+                        continue;
+
                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
                     {
                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While with the removal of all uses the vulnerability is actually fixed,
removing the creation of the linear mapping adds an extra layer of
protection. Similarly reducing visibility of the accessors mostly
eliminates the risk of undue re-introduction of uses of the linear
mappings.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8f975a747d..10175764e8 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -1755,9 +1755,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
 
-    /* Slot 258: Self linear mappings. */
+    /* Slot 258: Self linear mappings (shadow pt only). */
     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+        !shadow_mode_external(d) ? l4e_empty() :
         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
 
     /* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index c7fa18925b..1933a6a2a2 100644
--- xen/arch/x86/mm/shadow/private.h.orig
+++ xen/arch/x86/mm/shadow/private.h
@@ -137,6 +137,15 @@ enum {
 # define GUEST_PTE_SIZE 4
 #endif
 
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
 /******************************************************************************
  * Auditing routines
  */
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index cca7ea6e9d..d7551e594a 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)
 
     machine_to_phys_mapping_valid = 1;
 
-    /* Set up linear page table mapping. */
-    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
-              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
     return;
 
  nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 9ef9d03ca7..4670ab99f6 100644
--- xen/include/asm-x86/config.h.orig
+++ xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
  */
 #define PCI_MCFG_VIRT_START     (PML4_ADDR(257))
 #define PCI_MCFG_VIRT_END       (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
 /* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- xen/include/asm-x86/page.h.orig
+++ xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
 #define vmap_to_mfn(va)     _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
 #define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 extern l2_pgentry_t  *compat_idle_pg_table_l2;
 extern unsigned int   m2p_compat_vstart;

File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA345
$NetBSD: patch-XSA345,v 1.1 2020/10/21 09:03:05 bouyer Exp $

From edbe70427e17743351f1b739ea1536acd757ae6c Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Sat, 11 Jan 2020 21:57:41 +0000
Subject: [PATCH 1/3] x86/mm: Refactor map_pages_to_xen to have only a single
 exit path

We will soon need to perform clean-ups before returning.

No functional change.

This is part of XSA-345.

Reported-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 626768a950..79a3fac3cc 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -5194,6 +5194,7 @@ int map_pages_to_xen(
     l2_pgentry_t *pl2e, ol2e;
     l1_pgentry_t *pl1e, ol1e;
     unsigned int  i;
+    int rc = -ENOMEM;
 
 #define flush_flags(oldf) do {                 \
     unsigned int o_ = (oldf);                  \
@@ -5214,7 +5215,8 @@ int map_pages_to_xen(
         l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
 
         if ( !pl3e )
-            return -ENOMEM;
+            goto out;
+
         ol3e = *pl3e;
 
         if ( cpu_has_page1gb &&
@@ -5302,7 +5304,7 @@ int map_pages_to_xen(
 
             pl2e = alloc_xen_pagetable();
             if ( pl2e == NULL )
-                return -ENOMEM;
+                goto out;
 
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                 l2e_write(pl2e + i,
@@ -5331,7 +5333,7 @@ int map_pages_to_xen(
 
         pl2e = virt_to_xen_l2e(virt);
         if ( !pl2e )
-            return -ENOMEM;
+            goto out;
 
         if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) &
                ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
@@ -5374,7 +5376,7 @@ int map_pages_to_xen(
             {
                 pl1e = virt_to_xen_l1e(virt);
                 if ( pl1e == NULL )
-                    return -ENOMEM;
+                    goto out;
             }
             else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
             {
@@ -5401,7 +5403,7 @@ int map_pages_to_xen(
 
                 pl1e = alloc_xen_pagetable();
                 if ( pl1e == NULL )
-                    return -ENOMEM;
+                    goto out;
 
                 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                     l1e_write(&pl1e[i],
@@ -5545,7 +5547,10 @@ int map_pages_to_xen(
 
 #undef flush_flags
 
-    return 0;
+    rc = 0;
+
+ out:
+    return rc;
 }
 
 int populate_pt_range(unsigned long virt, unsigned long nr_mfns)
-- 
2.25.1

From 7101786be91dce650b6e79f1374c580c731bb348 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Sat, 11 Jan 2020 21:57:42 +0000
Subject: [PATCH 2/3] x86/mm: Refactor modify_xen_mappings to have one exit
 path

We will soon need to perform clean-ups before returning.

No functional change.

This is part of XSA-345.

Reported-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 79a3fac3cc..8ed3ecacbe 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -5577,6 +5577,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     l1_pgentry_t *pl1e;
     unsigned int  i;
     unsigned long v = s;
+    int rc = -ENOMEM;
 
     /* Set of valid PTE bits which may be altered. */
 #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
@@ -5618,7 +5619,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
             /* PAGE1GB: shatter the superpage and fall through. */
             pl2e = alloc_xen_pagetable();
             if ( !pl2e )
-                return -ENOMEM;
+                goto out;
+
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
                 l2e_write(pl2e + i,
                           l2e_from_pfn(l3e_get_pfn(*pl3e) +
@@ -5673,7 +5675,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
                 /* PSE: shatter the superpage and try again. */
                 pl1e = alloc_xen_pagetable();
                 if ( !pl1e )
-                    return -ENOMEM;
+                    goto out;
+
                 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
                     l1e_write(&pl1e[i],
                               l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
@@ -5802,7 +5805,10 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     flush_area(NULL, FLUSH_TLB_GLOBAL);
 
 #undef FLAGS_MASK
-    return 0;
+    rc = 0;
+
+ out:
+    return rc;
 }
 
 #undef flush_area
-- 
2.25.1

From e7bbc4a0b5af76a82f0dcf4afcbf1509b020eb73 Mon Sep 17 00:00:00 2001
From: Hongyan Xia <hongyxia@amazon.com>
Date: Sat, 11 Jan 2020 21:57:43 +0000
Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates

map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB
superpages if possible, to maximize TLB efficiency.  This means both
replacing superpage entries with smaller entries, and replacing
smaller entries with superpages.

Unfortunately, while some potential races are handled correctly,
others are not.  These include:

1. When one processor modifies a sub-superpage mapping while another
processor replaces the entire range with a superpage.

Take the following example:

Suppose L3[N] points to L2.  And suppose we have two processors, A and
B.

* A walks the pagetables, gets a pointer to L2.
* B replaces L3[N] with a 1GiB mapping.
* B frees L2
* A writes L2[M] #

This race is exacerbated by the fact that virt_to_xen_l[21]e doesn't
handle higher-level superpages properly: if you call virt_to_xen_l2e
on a virtual address within an L3 superpage, you'll either hit a BUG()
(most likely), or get a pointer into the middle of a data page; the
same goes for virt_to_xen_l1e on a virtual address within either an L3
or L2 superpage.

So take the following example:

* A reads pl3e and discovers it to point to an L2.
* B replaces L3[N] with a 1GiB mapping
* A calls virt_to_xen_l2e() and hits the BUG_ON() #

2. When two processors simultaneously try to replace a sub-superpage
mapping with a superpage mapping.

Take the following example:

Suppose L3[N] points to L2.  And suppose we have two processors, A and B,
both trying to replace L3[N] with a superpage.

* A walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2.
* B walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2.
* A writes the new value into L3[N]
* B writes the new value into L3[N]
* A recursively frees all the L1's under L2, then frees L2
* B recursively double-frees all the L1's under L2, then double-frees L2 #

Fix this by grabbing a lock for the entirety of the mapping update
operation.

Rather than grabbing map_pgdir_lock for the entire operation, however,
repurpose the PGT_locked bit from L3's page->type_info as a lock.
This means that rather than locking the entire address space, we
"only" lock a single 512GiB chunk of hypervisor address space at a
time.

There was a proposal for a lock-and-reverify approach, where we walk
the pagetables to the point where we decide what to do; then grab the
map_pgdir_lock, re-verify the information we collected without the
lock, and finally make the change (starting over again if anything had
changed).  Without being able to guarantee that the L2 table wasn't
freed, however, that means every read would need to be considered
potentially unsafe.  Thinking carefully about that is probably
something that wants to be done in public, not under time pressure.

This is part of XSA-345.

Reported-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 89 insertions(+), 3 deletions(-)
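
Spread over several hunks below, the usage pattern is: remember which L3
page is currently locked, drop that lock at the top of every loop
iteration, and re-acquire it for the L3 table covering the virtual address
being processed. Condensed from map_pages_to_xen() (Xen-internal helpers,
not standalone code):

    struct page_info *current_l3page;

    L3T_INIT(current_l3page);                 /* ZERO_BLOCK_PTR sentinel */

    while ( nr_mfns != 0 )
    {
        l3_pgentry_t *pl3e;

        L3T_UNLOCK(current_l3page);           /* release the previous 512GiB chunk */

        pl3e = virt_to_xen_l3e(virt);
        if ( !pl3e )
            goto out;

        current_l3page = virt_to_page(pl3e);  /* lock the L3 table we work under */
        L3T_LOCK(current_l3page);
        /* ... examine and update entries while holding the lock ... */
    }

 out:
    L3T_UNLOCK(current_l3page);               /* no-op if nothing is locked */
    return rc;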

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8ed3ecacbe..4ff24de73d 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -2153,6 +2153,50 @@ void page_unlock(struct page_info *page)
     current_locked_page_set(NULL);
 }
 
+/*
+ * L3 table locks:
+ *
+ * Used for serialization in map_pages_to_xen() and modify_xen_mappings().
+ *
+ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to
+ * reuse the PGT_locked flag. This lock is taken only when we move down to L3
+ * tables and below, since L4 (and above, for 5-level paging) is still globally
+ * protected by map_pgdir_lock.
+ *
+ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock().
+ * This has two implications:
+ * - We cannot reuse current_locked_page_* for debugging
+ * - To avoid the chance of deadlock, even for different pages, we
+ *   must never grab page_lock() after grabbing l3t_lock().  This
+ *   includes any page_lock()-based locks, such as
+ *   mem_sharing_page_lock().
+ *
+ * Also note that we grab the map_pgdir_lock while holding the
+ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
+ * reverse order.
+ */
+static void l3t_lock(struct page_info *page)
+{
+    unsigned long x, nx;
+
+    do {
+        while ( (x = page->u.inuse.type_info) & PGT_locked )
+            cpu_relax();
+        nx = x | PGT_locked;
+    } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
+}
+
+static void l3t_unlock(struct page_info *page)
+{
+    unsigned long x, nx, y = page->u.inuse.type_info;
+
+    do {
+        x = y;
+        BUG_ON(!(x & PGT_locked));
+        nx = x & ~PGT_locked;
+    } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
+}
+
 /*
  * PTE flags that a guest may change without re-validating the PTE.
  * All other bits affect translation, caching, or Xen's safety.
@@ -5184,6 +5228,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
                          flush_area_local((const void *)v, f) : \
                          flush_area_all((const void *)v, f))
 
+#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
+
+#define L3T_LOCK(page)        \
+    do {                      \
+        if ( locking )        \
+            l3t_lock(page);   \
+    } while ( false )
+
+#define L3T_UNLOCK(page)                           \
+    do {                                           \
+        if ( locking && (page) != ZERO_BLOCK_PTR ) \
+        {                                          \
+            l3t_unlock(page);                      \
+            (page) = ZERO_BLOCK_PTR;               \
+        }                                          \
+    } while ( false )
+
 int map_pages_to_xen(
     unsigned long virt,
     mfn_t mfn,
@@ -5195,6 +5256,7 @@ int map_pages_to_xen(
     l1_pgentry_t *pl1e, ol1e;
     unsigned int  i;
     int rc = -ENOMEM;
+    struct page_info *current_l3page;
 
 #define flush_flags(oldf) do {                 \
     unsigned int o_ = (oldf);                  \
@@ -5210,13 +5272,20 @@ int map_pages_to_xen(
     }                                          \
 } while (0)
 
+    L3T_INIT(current_l3page);
+
     while ( nr_mfns != 0 )
     {
-        l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
+        l3_pgentry_t *pl3e, ol3e;
 
+        L3T_UNLOCK(current_l3page);
+
+        pl3e = virt_to_xen_l3e(virt);
         if ( !pl3e )
             goto out;
 
+        current_l3page = virt_to_page(pl3e);
+        L3T_LOCK(current_l3page);
         ol3e = *pl3e;
 
         if ( cpu_has_page1gb &&
@@ -5550,6 +5619,7 @@ int map_pages_to_xen(
     rc = 0;
 
  out:
+    L3T_UNLOCK(current_l3page);
     return rc;
 }
 
@@ -5578,6 +5648,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     unsigned int  i;
     unsigned long v = s;
     int rc = -ENOMEM;
+    struct page_info *current_l3page;
 
     /* Set of valid PTE bits which may be altered. */
 #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
@@ -5586,11 +5657,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     ASSERT(IS_ALIGNED(s, PAGE_SIZE));
     ASSERT(IS_ALIGNED(e, PAGE_SIZE));
 
+    L3T_INIT(current_l3page);
+
     while ( v < e )
     {
-        l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
+        l3_pgentry_t *pl3e;
+
+        L3T_UNLOCK(current_l3page);
 
-        if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+        pl3e = virt_to_xen_l3e(v);
+        if ( !pl3e )
+            goto out;
+
+        current_l3page = virt_to_page(pl3e);
+        L3T_LOCK(current_l3page);
+
+        if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
         {
             /* Confirm the caller isn't trying to create new mappings. */
             ASSERT(!(nf & _PAGE_PRESENT));
@@ -5808,9 +5890,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
     rc = 0;
 
  out:
+    L3T_UNLOCK(current_l3page);
     return rc;
 }
 
+#undef L3T_LOCK
+#undef L3T_UNLOCK
+
 #undef flush_area
 
 int destroy_xen_mappings(unsigned long s, unsigned long e)
-- 
2.25.1


File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA346
$NetBSD: patch-XSA346,v 1.1 2020/10/21 09:03:05 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page

Deferring flushes to a single, wide range one - as is done when
handling XENMAPSPACE_gmfn_range - is okay only as long as
pages don't get freed ahead of the eventual flush. While the only
function setting the flag (xenmem_add_to_physmap()) suggests by its name
that it's only mapping new entries, in reality the way
xenmem_add_to_physmap_one() works means an unmap would happen not only
for the page being moved (but not freed) but, if the destination GFN is
populated, also for the page being displaced from that GFN. Collapsing
the two flushes for this GFN into just one (and even more so deferring
it to a batched invocation) is not correct.

This is part of XSA-346.

Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Acked-by: Julien Grall <jgrall@amazon.com>
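
The fix below amounts to saving the per-CPU flag, forcing an immediate
IOMMU TLB flush for the page that is about to be freed, and then restoring
the caller's batching. Condensed from the hunk that follows (Xen-internal
code, not standalone):

    bool *dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
    bool dont_flush = *dont_flush_p;

    *dont_flush_p = false;                          /* don't defer this flush */
    rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
    *dont_flush_p = dont_flush;                     /* restore the batching state */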

--- xen/common/memory.c.orig
+++ xen/common/memory.c
@@ -298,7 +298,10 @@ int guest_remove_page(struct domain *d,
     p2m_type_t p2mt;
 #endif
     mfn_t mfn;
+#ifdef CONFIG_HAS_PASSTHROUGH
+    bool *dont_flush_p, dont_flush;
     int rc;
+#endif
 
 #ifdef CONFIG_X86
     mfn = get_gfn_query(d, gmfn, &p2mt);
@@ -376,8 +379,22 @@ int guest_remove_page(struct domain *d,
         return -ENXIO;
     }
 
+#ifdef CONFIG_HAS_PASSTHROUGH
+    /*
+     * Since we're likely to free the page below, we need to suspend
+     * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes.
+     */
+    dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
+    dont_flush = *dont_flush_p;
+    *dont_flush_p = false;
+#endif
+
     rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
 
+#ifdef CONFIG_HAS_PASSTHROUGH
+    *dont_flush_p = dont_flush;
+#endif
+
     /*
      * With the lack of an IOMMU on some platforms, domains with DMA-capable
      * device must retrieve the same pfn when the hypercall populate_physmap
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU: hold page ref until after deferred TLB flush

When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB
flush for the "from" GFN range requires that the page remains allocated
to the guest until the TLB flush has actually occurred. Otherwise a
parallel hypercall to remove the page would only flush the TLB for the
GFN it has been moved to, but not the one it was mapped at originally.

This is part of XSA-346.

Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
Reported-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Julien Grall <jgrall@amazon.com>
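
The change spans xen/arch/{arm,x86}/mm.c, xen/common/memory.c and
xen/include/xen/mm.h. In outline: xenmem_add_to_physmap_one() hands the
page reference it took for a XENMAPSPACE_gmfn source page back to its
caller through the new extra.ppage pointer, xenmem_add_to_physmap()
collects those references (at most 16 per continuation) and only drops
them once the deferred IOMMU TLB flush of the original GFN range has been
performed. Condensed from the hunks below (Xen-internal code, not
standalone):

    /* xenmem_add_to_physmap_one(), XENMAPSPACE_gmfn success path: */
    if ( !rc && extra.ppage )
    {
        *extra.ppage = page;        /* transfer the reference to the caller ... */
        page = NULL;                /* ... instead of dropping it here */
    }

    /* xenmem_add_to_physmap(), after flushing the original GFN range: */
    for ( i = 0; i < done; ++i )
        put_page(pages[i]);         /* only now may the pages really go away */

    ret = iommu_iotlb_flush(d, xatp->gpfn - done, done);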

--- xen/arch/arm/mm.c.orig
+++ xen/arch/arm/mm.c
@@ -1222,7 +1222,7 @@ void share_xen_page_with_guest(struct pa
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gfn)
 {
@@ -1294,10 +1294,6 @@ int xenmem_add_to_physmap_one(
         break;
     }
     case XENMAPSPACE_dev_mmio:
-        /* extra should be 0. Reserved for future use. */
-        if ( extra.res0 )
-            return -EOPNOTSUPP;
-
         rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx));
         return rc;
 
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -4634,7 +4634,7 @@ static int handle_iomem_range(unsigned l
 int xenmem_add_to_physmap_one(
     struct domain *d,
     unsigned int space,
-    union xen_add_to_physmap_batch_extra extra,
+    union add_to_physmap_extra extra,
     unsigned long idx,
     gfn_t gpfn)
 {
@@ -4721,9 +4721,20 @@ int xenmem_add_to_physmap_one(
         rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
 
  put_both:
-    /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
+    /*
+     * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top.
+     * We also may need to transfer ownership of the page reference to our
+     * caller.
+     */
     if ( space == XENMAPSPACE_gmfn )
+    {
         put_gfn(d, gfn);
+        if ( !rc && extra.ppage )
+        {
+            *extra.ppage = page;
+            page = NULL;
+        }
+    }
 
     if ( page )
         put_page(page);
--- xen/common/memory.c.orig
+++ xen/common/memory.c
@@ -811,11 +811,10 @@ int xenmem_add_to_physmap(struct domain
 {
     unsigned int done = 0;
     long rc = 0;
-    union xen_add_to_physmap_batch_extra extra;
+    union add_to_physmap_extra extra = {};
+    struct page_info *pages[16];
 
-    if ( xatp->space != XENMAPSPACE_gmfn_foreign )
-        extra.res0 = 0;
-    else
+    if ( xatp->space == XENMAPSPACE_gmfn_foreign )
         extra.foreign_domid = DOMID_INVALID;
 
     if ( xatp->space != XENMAPSPACE_gmfn_range )
@@ -831,7 +830,10 @@ int xenmem_add_to_physmap(struct domain
 
 #ifdef CONFIG_HAS_PASSTHROUGH
     if ( need_iommu(d) )
+    {
         this_cpu(iommu_dont_flush_iotlb) = 1;
+        extra.ppage = &pages[0];
+    }
 #endif
 
     while ( xatp->size > done )
@@ -844,8 +846,12 @@ int xenmem_add_to_physmap(struct domain
         xatp->idx++;
         xatp->gpfn++;
 
+        if ( extra.ppage )
+            ++extra.ppage;
+
         /* Check for continuation if it's not the last iteration. */
-        if ( xatp->size > ++done && hypercall_preempt_check() )
+        if ( (++done > ARRAY_SIZE(pages) && extra.ppage) ||
+             (xatp->size > done && hypercall_preempt_check()) )
         {
             rc = start + done;
             break;
@@ -856,6 +862,7 @@ int xenmem_add_to_physmap(struct domain
     if ( need_iommu(d) )
     {
         int ret;
+        unsigned int i;
 
         this_cpu(iommu_dont_flush_iotlb) = 0;
 
@@ -863,6 +870,15 @@ int xenmem_add_to_physmap(struct domain
         if ( unlikely(ret) && rc >= 0 )
             rc = ret;
 
+        /*
+         * Now that the IOMMU TLB flush was done for the original GFN, drop
+         * the page references. The 2nd flush below is fine to make later, as
+         * whoever removes the page again from its new GFN will have to do
+         * another flush anyway.
+         */
+        for ( i = 0; i < done; ++i )
+            put_page(pages[i]);
+
         ret = iommu_iotlb_flush(d, xatp->gpfn - done, done);
         if ( unlikely(ret) && rc >= 0 )
             rc = ret;
@@ -876,6 +892,8 @@ static int xenmem_add_to_physmap_batch(s
                                        struct xen_add_to_physmap_batch *xatpb,
                                        unsigned int extent)
 {
+    union add_to_physmap_extra extra = {};
+
     if ( xatpb->size < extent )
         return -EILSEQ;
 
@@ -884,6 +902,19 @@ static int xenmem_add_to_physmap_batch(s
          !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
         return -EFAULT;
 
+    switch ( xatpb->space )
+    {
+    case XENMAPSPACE_dev_mmio:
+        /* res0 is reserved for future use. */
+        if ( xatpb->u.res0 )
+            return -EOPNOTSUPP;
+        break;
+
+    case XENMAPSPACE_gmfn_foreign:
+        extra.foreign_domid = xatpb->u.foreign_domid;
+        break;
+    }
+
     while ( xatpb->size > extent )
     {
         xen_ulong_t idx;
@@ -896,8 +927,7 @@ static int xenmem_add_to_physmap_batch(s
                                                extent, 1)) )
             return -EFAULT;
 
-        rc = xenmem_add_to_physmap_one(d, xatpb->space,
-                                       xatpb->u,
+        rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
                                        idx, _gfn(gpfn));
 
         if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
--- xen/include/xen/mm.h.orig
+++ xen/include/xen/mm.h
@@ -577,8 +577,22 @@ void scrub_one_page(struct page_info *);
                       &(d)->xenpage_list : &(d)->page_list)
 #endif
 
+union add_to_physmap_extra {
+    /*
+     * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs
+     * to be kept until after the flush, so the page can't get removed from
+     * the domain (and re-used for another purpose) beforehand. By passing
+     * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants
+     * to have ownership of such a reference transferred in the success case.
+     */
+    struct page_info **ppage;
+
+    /* XENMAPSPACE_gmfn_foreign */
+    domid_t foreign_domid;
+};
+
 int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
-                              union xen_add_to_physmap_batch_extra extra,
+                              union add_to_physmap_extra extra,
                               unsigned long idx, gfn_t gfn);
 
 int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,

File Added: pkgsrc/sysutils/xenkernel411/patches/Attic/patch-XSA347
$NetBSD: patch-XSA347,v 1.1 2020/10/21 09:03:05 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: AMD/IOMMU: update live PTEs atomically

Updating a live PTE word by word allows the IOMMU to see a partially
updated entry. Construct the new entry fully in a local variable and
then write the new entry by a single insn.

This is part of XSA-347.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
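
As a minimal standalone illustration of the idea (plain C11, not Xen code;
the names are invented for the example): assemble the complete 64-bit
entry in a local variable and publish it with a single atomic store,
instead of writing the two 32-bit halves separately, where an observer
could see a half-updated entry.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Stand-in for a live 64-bit PTE that another agent may read at any time. */
    static _Atomic uint64_t pte;

    static void set_pte(uint32_t lo, uint32_t hi)
    {
        uint64_t full = ((uint64_t)hi << 32) | lo;   /* build the entry fully first */

        /* One 64-bit store (a single insn on x86-64) publishes it atomically. */
        atomic_store_explicit(&pte, full, memory_order_relaxed);
    }

    int main(void)
    {
        set_pte(0x00000627u, 0x000f0000u);   /* arbitrary example values */
        return 0;
    }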

--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -41,7 +41,7 @@ static void clear_iommu_pte_present(unsi
 
     table = map_domain_page(_mfn(l1_mfn));
     pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1);
-    *pte = 0;
+    write_atomic(pte, 0);
     unmap_domain_page(table);
 }
 
@@ -49,7 +49,7 @@ static bool_t set_iommu_pde_present(u32
                                     unsigned int next_level,
                                     bool_t iw, bool_t ir)
 {
-    uint64_t addr_lo, addr_hi, maddr_next;
+    uint64_t addr_lo, addr_hi, maddr_next, full;
     u32 entry;
     bool need_flush = false, old_present;
 
@@ -106,7 +106,7 @@ static bool_t set_iommu_pde_present(u32
     if ( next_level == IOMMU_PAGING_MODE_LEVEL_0 )
         set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
                              IOMMU_PTE_FC_MASK, IOMMU_PTE_FC_SHIFT, &entry);
-    pde[1] = entry;
+    full = (uint64_t)entry << 32;
 
     /* mark next level as 'present' */
     set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
@@ -118,7 +118,9 @@ static bool_t set_iommu_pde_present(u32
     set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
                          IOMMU_PDE_PRESENT_MASK,
                          IOMMU_PDE_PRESENT_SHIFT, &entry);
-    pde[0] = entry;
+    full |= entry;
+
+    write_atomic((uint64_t *)pde, full);
 
     return need_flush;
 }
From: Jan Beulich <jbeulich@suse.com>
Subject: AMD/IOMMU: ensure suitable ordering of DTE modifications

DMA and interrupt translation should be enabled only after other
applicable DTE fields have been written. Similarly when disabling
translation or when moving a device between domains, translation should
first be disabled, before other entry fields get modified. Note however
that the "moving" aspect doesn't apply to the interrupt remapping side,
as domain specifics are maintained in the IRTEs here, not the DTE. We
also never disable interrupt remapping once it got enabled for a device
(the respective argument passed is always the immutable iommu_intremap).

This is part of XSA-347.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
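
The ordering requirement is the usual publish pattern: make the entry
invalid before touching it, rewrite the other fields, and only re-enable
it with the final atomic write, with write barriers in between. A minimal
standalone C11 sketch of that pattern (illustrative only, invented names;
the patch itself operates on dte[0]/dte[1]/dte[4]/dte[5] using smp_wmb()
and write_atomic()):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Stand-in for a descriptor that hardware reads concurrently. */
    struct dte {
        _Atomic uint32_t ctrl;              /* word holding the "valid" bit */
        _Atomic uint32_t field1, field2;    /* only consumed while valid */
    };

    #define DTE_VALID (1u << 0)

    static void dte_update(struct dte *d, uint32_t ctrl, uint32_t f1, uint32_t f2)
    {
        /* 1. Disable the entry so stale fields stop being consumed. */
        atomic_store_explicit(&d->ctrl, ctrl & ~DTE_VALID, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* plays the role of smp_wmb() */

        /* 2. Rewrite the remaining fields while the entry is invalid. */
        atomic_store_explicit(&d->field1, f1, memory_order_relaxed);
        atomic_store_explicit(&d->field2, f2, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* again ~ smp_wmb() */

        /* 3. Re-enable last, with a single atomic store of the controlling word. */
        atomic_store_explicit(&d->ctrl, ctrl | DTE_VALID, memory_order_relaxed);
    }

    int main(void)
    {
        static struct dte d;
        dte_update(&d, 0, 0x1234u, 0x5678u);    /* arbitrary example values */
        return 0;
    }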

--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -147,7 +147,22 @@ void amd_iommu_set_root_page_table(
     u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid)
 {
     u64 addr_hi, addr_lo;
-    u32 entry;
+    u32 entry, dte0 = dte[0];
+
+    if ( valid ||
+         get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_VALID_MASK,
+                                IOMMU_DEV_TABLE_VALID_SHIFT) )
+    {
+        set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, dte0,
+                             IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
+                             IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &dte0);
+        set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, dte0,
+                             IOMMU_DEV_TABLE_VALID_MASK,
+                             IOMMU_DEV_TABLE_VALID_SHIFT, &dte0);
+        dte[0] = dte0;
+        smp_wmb();
+    }
+
     set_field_in_reg_u32(domain_id, 0,
                          IOMMU_DEV_TABLE_DOMAIN_ID_MASK,
                          IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry);
@@ -166,8 +181,9 @@ void amd_iommu_set_root_page_table(
                          IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK,
                          IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry);
     dte[1] = entry;
+    smp_wmb();
 
-    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, dte0,
                          IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
                          IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry);
     set_field_in_reg_u32(paging_mode, entry,
@@ -180,7 +196,7 @@ void amd_iommu_set_root_page_table(
                          IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_DEV_TABLE_VALID_MASK,
                          IOMMU_DEV_TABLE_VALID_SHIFT, &entry);
-    dte[0] = entry;
+    write_atomic(&dte[0], entry);
 }
 
 void iommu_dte_set_iotlb(u32 *dte, u8 i)
@@ -212,6 +228,7 @@ void __init amd_iommu_set_intremap_table
                         IOMMU_DEV_TABLE_INT_CONTROL_MASK,
                         IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry);
     dte[5] = entry;
+    smp_wmb();
 
     set_field_in_reg_u32((u32)addr_lo >> 6, 0,
                         IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK,
@@ -229,7 +246,7 @@ void __init amd_iommu_set_intremap_table
                          IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_DEV_TABLE_INT_VALID_MASK,
                          IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry);
-    dte[4] = entry;
+    write_atomic(&dte[4], entry);
 }
 
 void __init iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev)