Mon Mar 6 03:32:46 2017 UTC ()
Pull up following revision(s) (requested by bouyer in ticket #1388):
	sys/arch/x86/include/pmap.h: revision 1.63 via patch
	sys/arch/x86/x86/pmap.c: revision 1.241 via patch
Should be PG_k; doesn't change anything.
--
Remove PG_u from the kernel pages on Xen. Otherwise there is no privilege
separation between the kernel and userland.
On Xen-amd64, the kernel runs in ring3 just like userland, and the
separation is guaranteed by the hypervisor - each syscall/trap is
intercepted by Xen and sent manually to the kernel. Before that, the
hypervisor modifies the page tables so that the kernel becomes accessible.
Later, when returning to userland, the hypervisor removes the kernel pages
and flushes the TLB.
However, TLB flushes are costly, and in order to reduce the number of pages
flushed, Xen marks the userland pages as global while keeping the kernel
ones local. This way, when returning to userland, only the kernel pages get
flushed - which makes sense, since they are the only ones that were removed
from the mapping.
Xen differentiates the userland pages by looking at their PG_u bit in the
PTE; if a page has this bit, Xen tags it as global; otherwise Xen manually
adds the bit but keeps the page as local. The thing is, since we set PG_u
in the kernel pages, Xen believes our kernel pages are in fact userland
pages, so it marks them as global. Therefore, when returning to userland,
the kernel pages indeed get removed from the page tree, but they are not
flushed from the TLB, which means that they are still accessible.
With this - and depending on the DTLB size - userland has a small window
during which it can read and write the last kernel pages accessed, which is
enough to completely escalate privileges: the sysent structure is read on
every syscall, and chances are that it will still be cached in the TLB.
Userland can then use this to patch a chosen syscall, make it point to a
userland function, retrieve %gs and compute the address of its credentials,
and finally grant itself root privileges.


(snj)
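
To make the mechanism described above concrete, here is a small illustrative
model in plain C. It is a sketch only, not Xen's or NetBSD's actual code: the
PG_V/PG_u/PG_G values are the standard x86 PTE bit positions (assumed here),
and xen_tag() simply restates the tagging behaviour described in the log
message.

/*
 * Sketch only: standard x86 PTE bits (assumed values) and a model of the
 * global/local tagging described above; not Xen's implementation.
 */
#include <stdint.h>

#define PG_V	0x001ULL	/* present */
#define PG_u	0x004ULL	/* user/supervisor: set => userland-accessible */
#define PG_G	0x100ULL	/* global: TLB entry survives an address-space switch */

static uint64_t
xen_tag(uint64_t pte)
{
	if (pte & PG_u)
		return pte | PG_G;	/* treated as a userland page: marked global */
	return pte | PG_u;		/* kernel page: Xen adds PG_u itself, stays local */
}

/*
 * With the old "#define PG_k PG_u" on Xen/amd64, a kernel PTE built as
 * "pa | PG_k | PG_V" already carried PG_u, so the model above marks it
 * global and it survives the TLB flush on return to userland; that is the
 * window described in the log message.  With PG_k defined as 0, kernel
 * PTEs take the second branch and get flushed as intended.
 */
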
diff -r1.55.4.1.2.1 -r1.55.4.1.2.2 src/sys/arch/x86/include/pmap.h
diff -r1.183.2.2.2.2 -r1.183.2.2.2.3 src/sys/arch/x86/x86/pmap.c

cvs diff -r1.55.4.1.2.1 -r1.55.4.1.2.2 src/sys/arch/x86/include/pmap.h

--- src/sys/arch/x86/include/pmap.h 2016/12/18 07:02:59 1.55.4.1.2.1
+++ src/sys/arch/x86/include/pmap.h 2017/03/06 03:32:45 1.55.4.1.2.2
@@ -1,14 +1,14 @@
-/*	$NetBSD: pmap.h,v 1.55.4.1.2.1 2016/12/18 07:02:59 snj Exp $	*/
+/*	$NetBSD: pmap.h,v 1.55.4.1.2.2 2017/03/06 03:32:45 snj Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
@@ -169,35 +169,27 @@ struct pmap {
 	uint64_t pm_ncsw;		/* for assertions */
 	struct vm_page *pm_gc_ptp;	/* pages from pmap g/c */
 };
 
 /* macro to access pm_pdirpa slots */
 #ifdef PAE
 #define pmap_pdirpa(pmap, index) \
 	((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
 #else
 #define pmap_pdirpa(pmap, index) \
 	((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
 #endif
 
-/*
- * flag to be used for kernel mappings: PG_u on Xen/amd64,
- * 0 otherwise.
- */
-#if defined(XEN) && defined(__x86_64__)
-#define PG_k PG_u
-#else
 #define PG_k 0
-#endif
 
 /*
  * MD flags that we use for pmap_enter and pmap_kenter_pa:
  */
 
 /*
  * global kernel variables
  */
 
 /*
  * PDPpaddr is the physical address of the kernel's PDP.
  * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
  *   value associated to the kernel process, proc0.
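
The pmap.c hunks below only need to swap two literal uses of PG_u in
Xen-specific code for PG_k: with the header now defining PG_k as 0
unconditionally, OR-ing PG_k into a PTE is a compile-time no-op, so kernel
mappings no longer carry the user bit on Xen/amd64. A minimal sketch of that
effect follows (the PG_V/PG_u values are the standard x86 bits, assumed here
rather than taken from the NetBSD headers):

#include <assert.h>
#include <stdint.h>

#define PG_V	0x001ULL	/* present */
#define PG_u	0x004ULL	/* user/supervisor bit */
#define PG_k	0		/* kernel-mapping flag after this change */

int
main(void)
{
	uint64_t pa = 0x123000ULL;		/* arbitrary page-aligned address */
	uint64_t old_pte = pa | PG_u | PG_V;	/* old Xen/amd64: PG_k expanded to PG_u */
	uint64_t new_pte = pa | PG_k | PG_V;	/* new: the U/S bit stays clear */

	assert((old_pte & PG_u) != 0);	/* Xen would have tagged this one global */
	assert((new_pte & PG_u) == 0);	/* this one stays local and gets flushed */
	return 0;
}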

cvs diff -r1.183.2.2.2.2 -r1.183.2.2.2.3 src/sys/arch/x86/x86/pmap.c

--- src/sys/arch/x86/x86/pmap.c 2016/12/18 07:02:59 1.183.2.2.2.2
+++ src/sys/arch/x86/x86/pmap.c 2017/03/06 03:32:45 1.183.2.2.2.3
@@ -1,14 +1,14 @@
-/*	$NetBSD: pmap.c,v 1.183.2.2.2.2 2016/12/18 07:02:59 snj Exp $	*/
+/*	$NetBSD: pmap.c,v 1.183.2.2.2.3 2017/03/06 03:32:45 snj Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
@@ -161,27 +161,27 @@
  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
  *     and David Greenman.
  *
  * [3] the Mach pmap.  this pmap, from CMU, seems to have migrated
  *     between several processors.  the VAX version was done by
  *     Avadis Tevanian, Jr., and Michael Wayne Young.  the i386
  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
  *     David Golub, and Richard Draves.  the alpha version was
  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
  *     (NetBSD/alpha).
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.183.2.2.2.2 2016/12/18 07:02:59 snj Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.183.2.2.2.3 2017/03/06 03:32:45 snj Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
 #include "opt_multiprocessor.h"
 #include "opt_xen.h"
 #if !defined(__x86_64__)
 #include "opt_kstack_dr0.h"
 #endif /* !defined(__x86_64__) */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pool.h>
@@ -1597,27 +1597,27 @@ pmap_bootstrap(vaddr_t kva_start)
 	/*
 	 * We want a dummy page directory for Xen:
 	 * when deactivate a pmap, Xen will still consider it active.
 	 * So we set user PGD to this one to lift all protection on
 	 * the now inactive page tables set.
 	 */
 	xen_dummy_user_pgd = avail_start;
 	avail_start += PAGE_SIZE;
 
 	/* Zero fill it, the less checks in Xen it requires the better */
 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
 	/* Mark read-only */
 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
-	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
+	    pmap_pa2pte(xen_dummy_user_pgd) | PG_k | PG_V, UVMF_INVLPG);
 	/* Pin as L4 */
 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
 #endif /* __x86_64__ */
 	idt_vaddr = virtual_avail;		/* don't need pte */
 	idt_paddr = avail_start;		/* steal a page */
 	/*
 	 * Xen require one more page as we can't store
 	 * GDT and LDT on the same page
 	 */
 	virtual_avail += 3 * PAGE_SIZE;
 	avail_start += 3 * PAGE_SIZE;
 #else /* XEN */
 	idt_vaddr = virtual_avail;		/* don't need pte */
@@ -2190,27 +2190,27 @@ pmap_pdp_ctor(void *arg, void *v, int fl
 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
 	 */
 
 #if defined(XEN) && defined(__x86_64__)
 	/* fetch the physical address of the page directory. */
 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
 
 	/* zero init area */
 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
 	/*
 	 * this pdir will NEVER be active in kernel mode
 	 * so mark recursive entry invalid
 	 */
-	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
+	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_k;
 	/*
 	 * PDP constructed this way won't be for kernel,
 	 * hence we don't put kernel mappings on Xen.
 	 * But we need to make pmap_create() happy, so put a dummy (without
 	 * PG_V) value at the right place.
 	 */
 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
 	    (pd_entry_t)-1 & PG_FRAME;
 #else /* XEN && __x86_64__*/
 	/* zero init area */
 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
 
 	object = (vaddr_t)v;