Mon Mar 6 08:17:49 2017 UTC ()
Pull up following revision(s) (requested by bouyer in ticket #1441):
	sys/arch/x86/x86/pmap.c: revision 1.241 via patch
	sys/arch/x86/include/pmap.h: revision 1.63 via patch
The flag used here should be PG_k rather than PG_u; this doesn't change anything functionally.
--
Remove PG_u from the kernel pages on Xen. Otherwise there is no privilege
separation between the kernel and userland.
On Xen-amd64, the kernel runs in ring3 just like userland, and the
separation is guaranteed by the hypervisor - each syscall/trap is
intercepted by Xen and sent manually to the kernel. Before that, the
hypervisor modifies the page tables so that the kernel becomes accessible.
Later, when returning to userland, the hypervisor removes the kernel pages
and flushes the TLB.
However, TLB flushes are costly, and in order to reduce the number of pages
flushed Xen marks the userland pages as global, while keeping the kernel
ones as local. This way, when returning to userland, only the kernel pages
get flushed - which makes sense since they are the only ones that got
removed from the mapping.
Xen differentiates the userland pages by looking at their PG_u bit in the
PTE; if a page has this bit then Xen tags it as global, otherwise Xen
manually adds the bit but keeps the page as local. The thing is, since we
set PG_u in the kernel pages, Xen believes our kernel pages are in fact
userland pages, so it marks them as global. Therefore, when returning to
userland, the kernel pages indeed get removed from the page tree, but are
not flushed from the TLB. Which means that they are still accessible.
With this - and depending on the DTLB size - userland has a small window
where it can read/write to the last kernel pages accessed, which is enough
to completely escalate privileges: the sysent structure systematically gets
read when performing a syscall, and chances are that it will still be
cached in the TLB. Userland can then use this to patch a chosen syscall,
make it point to a userland function, retrieve %gs and compute the address
of its credentials, and finally grant itself root privileges.


(snj)
diff -r1.49.2.2 -r1.49.2.2.4.1 src/sys/arch/x86/include/pmap.h
diff -r1.164.2.4.4.1 -r1.164.2.4.4.2 src/sys/arch/x86/x86/pmap.c

cvs diff -r1.49.2.2 -r1.49.2.2.4.1 src/sys/arch/x86/include/pmap.h (expand / switch to unified diff)

--- src/sys/arch/x86/include/pmap.h 2012/05/09 03:22:52 1.49.2.2
+++ src/sys/arch/x86/include/pmap.h 2017/03/06 08:17:49 1.49.2.2.4.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: pmap.h,v 1.49.2.2 2012/05/09 03:22:52 riz Exp $ */ 1/* $NetBSD: pmap.h,v 1.49.2.2.4.1 2017/03/06 08:17:49 snj Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 1997 Charles D. Cranor and Washington University. 4 * Copyright (c) 1997 Charles D. Cranor and Washington University.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -172,35 +172,27 @@ struct pmap { @@ -172,35 +172,27 @@ struct pmap {
172 uint64_t pm_ncsw; /* for assertions */ 172 uint64_t pm_ncsw; /* for assertions */
173 struct vm_page *pm_gc_ptp; /* pages from pmap g/c */ 173 struct vm_page *pm_gc_ptp; /* pages from pmap g/c */
174}; 174};
175 175
176/* macro to access pm_pdirpa slots */ 176/* macro to access pm_pdirpa slots */
177#ifdef PAE 177#ifdef PAE
178#define pmap_pdirpa(pmap, index) \ 178#define pmap_pdirpa(pmap, index) \
179 ((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t)) 179 ((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
180#else 180#else
181#define pmap_pdirpa(pmap, index) \ 181#define pmap_pdirpa(pmap, index) \
182 ((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t)) 182 ((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
183#endif 183#endif
184 184
185/*  
186 * flag to be used for kernel mappings: PG_u on Xen/amd64,  
187 * 0 otherwise. 
188 */ 
189#if defined(XEN) && defined(__x86_64__) 
190#define PG_k PG_u 
191#else 
192#define PG_k 0 185#define PG_k 0
193#endif 
194 186
195/* 187/*
196 * MD flags that we use for pmap_enter and pmap_kenter_pa: 188 * MD flags that we use for pmap_enter and pmap_kenter_pa:
197 */ 189 */
198 190
199/* 191/*
200 * global kernel variables 192 * global kernel variables
201 */ 193 */
202 194
203/* 195/*
204 * PDPpaddr is the physical address of the kernel's PDP. 196 * PDPpaddr is the physical address of the kernel's PDP.
205 * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3 197 * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
206 * value associated to the kernel process, proc0. 198 * value associated to the kernel process, proc0.

cvs diff -r1.164.2.4.4.1 -r1.164.2.4.4.2 src/sys/arch/x86/x86/pmap.c (expand / switch to unified diff)

--- src/sys/arch/x86/x86/pmap.c 2016/07/14 07:09:39 1.164.2.4.4.1
+++ src/sys/arch/x86/x86/pmap.c 2017/03/06 08:17:49 1.164.2.4.4.2
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: pmap.c,v 1.164.2.4.4.1 2016/07/14 07:09:39 snj Exp $ */ 1/* $NetBSD: pmap.c,v 1.164.2.4.4.2 2017/03/06 08:17:49 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran. 8 * by Andrew Doran.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
@@ -161,27 +161,27 @@ @@ -161,27 +161,27 @@
161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162 * and David Greenman. 162 * and David Greenman.
163 * 163 *
164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
165 * between several processors. the VAX version was done by 165 * between several processors. the VAX version was done by
166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
168 * David Golub, and Richard Draves. the alpha version was 168 * David Golub, and Richard Draves. the alpha version was
169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170 * (NetBSD/alpha). 170 * (NetBSD/alpha).
171 */ 171 */
172 172
173#include <sys/cdefs.h> 173#include <sys/cdefs.h>
174__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.164.2.4.4.1 2016/07/14 07:09:39 snj Exp $"); 174__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.164.2.4.4.2 2017/03/06 08:17:49 snj Exp $");
175 175
176#include "opt_user_ldt.h" 176#include "opt_user_ldt.h"
177#include "opt_lockdebug.h" 177#include "opt_lockdebug.h"
178#include "opt_multiprocessor.h" 178#include "opt_multiprocessor.h"
179#include "opt_xen.h" 179#include "opt_xen.h"
180#if !defined(__x86_64__) 180#if !defined(__x86_64__)
181#include "opt_kstack_dr0.h" 181#include "opt_kstack_dr0.h"
182#endif /* !defined(__x86_64__) */ 182#endif /* !defined(__x86_64__) */
183 183
184#include <sys/param.h> 184#include <sys/param.h>
185#include <sys/systm.h> 185#include <sys/systm.h>
186#include <sys/proc.h> 186#include <sys/proc.h>
187#include <sys/pool.h> 187#include <sys/pool.h>
@@ -1457,27 +1457,27 @@ pmap_bootstrap(vaddr_t kva_start) @@ -1457,27 +1457,27 @@ pmap_bootstrap(vaddr_t kva_start)
1457 /* 1457 /*
1458 * We want a dummy page directory for Xen: 1458 * We want a dummy page directory for Xen:
1459 * when deactivate a pmap, Xen will still consider it active. 1459 * when deactivate a pmap, Xen will still consider it active.
1460 * So we set user PGD to this one to lift all protection on 1460 * So we set user PGD to this one to lift all protection on
1461 * the now inactive page tables set. 1461 * the now inactive page tables set.
1462 */ 1462 */
1463 xen_dummy_user_pgd = avail_start; 1463 xen_dummy_user_pgd = avail_start;
1464 avail_start += PAGE_SIZE; 1464 avail_start += PAGE_SIZE;
1465  1465
1466 /* Zero fill it, the less checks in Xen it requires the better */ 1466 /* Zero fill it, the less checks in Xen it requires the better */
1467 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1467 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1468 /* Mark read-only */ 1468 /* Mark read-only */
1469 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1469 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1470 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1470 pmap_pa2pte(xen_dummy_user_pgd) | PG_k | PG_V, UVMF_INVLPG);
1471 /* Pin as L4 */ 1471 /* Pin as L4 */
1472 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1472 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1473#endif /* __x86_64__ */ 1473#endif /* __x86_64__ */
1474 idt_vaddr = virtual_avail; /* don't need pte */ 1474 idt_vaddr = virtual_avail; /* don't need pte */
1475 idt_paddr = avail_start; /* steal a page */ 1475 idt_paddr = avail_start; /* steal a page */
1476 /* 1476 /*
1477 * Xen require one more page as we can't store 1477 * Xen require one more page as we can't store
1478 * GDT and LDT on the same page 1478 * GDT and LDT on the same page
1479 */ 1479 */
1480 virtual_avail += 3 * PAGE_SIZE; 1480 virtual_avail += 3 * PAGE_SIZE;
1481 avail_start += 3 * PAGE_SIZE; 1481 avail_start += 3 * PAGE_SIZE;
1482#else /* XEN */ 1482#else /* XEN */
1483 idt_vaddr = virtual_avail; /* don't need pte */ 1483 idt_vaddr = virtual_avail; /* don't need pte */
@@ -2054,27 +2054,27 @@ pmap_pdp_ctor(void *arg, void *v, int fl @@ -2054,27 +2054,27 @@ pmap_pdp_ctor(void *arg, void *v, int fl
2054 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2054 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2055 */ 2055 */
2056 2056
2057#if defined(XEN) && defined(__x86_64__) 2057#if defined(XEN) && defined(__x86_64__)
2058 /* fetch the physical address of the page directory. */ 2058 /* fetch the physical address of the page directory. */
2059 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2059 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2060 2060
2061 /* zero init area */ 2061 /* zero init area */
2062 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2062 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2063 /* 2063 /*
2064 * this pdir will NEVER be active in kernel mode 2064 * this pdir will NEVER be active in kernel mode
2065 * so mark recursive entry invalid 2065 * so mark recursive entry invalid
2066 */ 2066 */
2067 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2067 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_k;
2068 /* 2068 /*
2069 * PDP constructed this way won't be for kernel, 2069 * PDP constructed this way won't be for kernel,
2070 * hence we don't put kernel mappings on Xen. 2070 * hence we don't put kernel mappings on Xen.
2071 * But we need to make pmap_create() happy, so put a dummy (without 2071 * But we need to make pmap_create() happy, so put a dummy (without
2072 * PG_V) value at the right place. 2072 * PG_V) value at the right place.
2073 */ 2073 */
2074 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2074 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2075 (pd_entry_t)-1 & PG_FRAME; 2075 (pd_entry_t)-1 & PG_FRAME;
2076#else /* XEN && __x86_64__*/ 2076#else /* XEN && __x86_64__*/
2077 /* zero init area */ 2077 /* zero init area */
2078 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2078 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2079 2079
2080 object = (vaddr_t)v; 2080 object = (vaddr_t)v;