Tue Mar 13 16:45:52 2018 UTC
Mmh, add a missing x86_disable_intr(). My intention there was to ensure
interrupts are disabled before the CPUs reach the barriers.


(maxv)
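
For context, the added call lands at the top of svs_disable_cpu(), which every
CPU runs via xc_broadcast() when SVS is disabled at runtime. The sketch below
shows the intended shape of that rendezvous after this change: save the PSL,
explicitly disable interrupts, then spin on the two shared counters so that all
CPUs stay in step while cpu0 hotpatches. This is a minimal illustration reusing
the names from the diff below, not the actual function body.

static void
svs_disable_cpu_sketch(void *arg1, void *arg2)
{
	u_long psl;

	psl = x86_read_psl();	/* remember the current interrupt state */
	x86_disable_intr();	/* the missing piece: spin with interrupts off */

	/* Rendezvous #1: wait until every CPU has arrived here. */
	atomic_dec_ulong(&svs_cpu_barrier1);
	while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0)
		x86_pause();

	/* ... per-CPU work: hotpatch (cpu0 only), MSR_LSTAR, CR4_PGE ... */

	/* Rendezvous #2: wait until every CPU has finished its work. */
	atomic_dec_ulong(&svs_cpu_barrier2);
	while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0)
		x86_pause();

	x86_write_psl(psl);	/* restore the saved interrupt state */
}
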
diff -r1.13 -r1.14 src/sys/arch/x86/x86/svs.c

cvs diff -r1.13 -r1.14 src/sys/arch/x86/x86/svs.c

--- src/sys/arch/x86/x86/svs.c 2018/03/01 16:49:06 1.13
+++ src/sys/arch/x86/x86/svs.c 2018/03/13 16:45:52 1.14
@@ -1,752 +1,753 @@
1/* $NetBSD: svs.c,v 1.13 2018/03/01 16:49:06 maxv Exp $ */ 1/* $NetBSD: svs.c,v 1.14 2018/03/13 16:45:52 maxv Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 2018 The NetBSD Foundation, Inc. 4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Maxime Villard. 8 * by Maxime Villard.
9 * 9 *
10 * Redistribution and use in source and binary forms, with or without 10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions 11 * modification, are permitted provided that the following conditions
12 * are met: 12 * are met:
13 * 1. Redistributions of source code must retain the above copyright 13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer. 14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright 15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the 16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution. 17 * documentation and/or other materials provided with the distribution.
18 * 18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE. 29 * POSSIBILITY OF SUCH DAMAGE.
30 */ 30 */
31 31
32#include <sys/cdefs.h> 32#include <sys/cdefs.h>
33__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.13 2018/03/01 16:49:06 maxv Exp $"); 33__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.14 2018/03/13 16:45:52 maxv Exp $");
34 34
35#include "opt_svs.h" 35#include "opt_svs.h"
36 36
37#include <sys/param.h> 37#include <sys/param.h>
38#include <sys/systm.h> 38#include <sys/systm.h>
39#include <sys/proc.h> 39#include <sys/proc.h>
40#include <sys/cpu.h> 40#include <sys/cpu.h>
41#include <sys/sysctl.h> 41#include <sys/sysctl.h>
42#include <sys/xcall.h> 42#include <sys/xcall.h>
43 43
44#include <x86/cputypes.h> 44#include <x86/cputypes.h>
45#include <machine/cpuvar.h> 45#include <machine/cpuvar.h>
46#include <machine/frameasm.h> 46#include <machine/frameasm.h>
47 47
48#include <uvm/uvm.h> 48#include <uvm/uvm.h>
49#include <uvm/uvm_page.h> 49#include <uvm/uvm_page.h>
50 50
51/* 51/*
52 * Separate Virtual Space 52 * Separate Virtual Space
53 * 53 *
54 * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context 54 * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
55 * switch to a user pmap, the lower half of updirpa is populated with the 55 * switch to a user pmap, the lower half of updirpa is populated with the
56 * entries containing the userland pages. 56 * entries containing the userland pages.
57 * 57 *
58 * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 58 * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 59 * 59 *
60 * We use a special per-cpu page that we call UTLS, for User Thread Local 60 * We use a special per-cpu page that we call UTLS, for User Thread Local
61 * Storage. Each CPU has one UTLS page. This page has two VAs: 61 * Storage. Each CPU has one UTLS page. This page has two VAs:
62 * 62 *
63 * o When the user page tables are loaded in CR3, the VA to access this 63 * o When the user page tables are loaded in CR3, the VA to access this
64 * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is 64 * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
65 * _constant_ across CPUs, but in the user page tables this VA points to 65 * _constant_ across CPUs, but in the user page tables this VA points to
66 * the physical page of the UTLS that is _local_ to the CPU. 66 * the physical page of the UTLS that is _local_ to the CPU.
67 * 67 *
68 * o When the kernel page tables are loaded in CR3, the VA to access this 68 * o When the kernel page tables are loaded in CR3, the VA to access this
69 * page is ci->ci_svs_utls. 69 * page is ci->ci_svs_utls.
70 * 70 *
71 * +----------------------------------------------------------------------+ 71 * +----------------------------------------------------------------------+
72 * | CPU0 Local Data (Physical Page) | 72 * | CPU0 Local Data (Physical Page) |
73 * | +------------------+ +-------------+ | 73 * | +------------------+ +-------------+ |
74 * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | | 74 * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
75 * | +------------------+ +-------------+ | 75 * | +------------------+ +-------------+ |
76 * +-------------------------------------------------------------^--------+ 76 * +-------------------------------------------------------------^--------+
77 * | 77 * |
78 * +----------+ 78 * +----------+
79 * | 79 * |
80 * +----------------------------------------------------------------------+ | 80 * +----------------------------------------------------------------------+ |
81 * | CPU1 Local Data (Physical Page) | | 81 * | CPU1 Local Data (Physical Page) | |
82 * | +------------------+ +-------------+ | | 82 * | +------------------+ +-------------+ | |
83 * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | | 83 * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
84 * | +------------------+ +-------------+ | | 84 * | +------------------+ +-------------+ | |
85 * +-------------------------------------------------------------^--------+ | 85 * +-------------------------------------------------------------^--------+ |
86 * | | 86 * | |
87 * +------------------+ /----------------------+ | 87 * +------------------+ /----------------------+ |
88 * | Kern Page Tables | ci->ci_svs_utls | 88 * | Kern Page Tables | ci->ci_svs_utls |
89 * +------------------+ \---------------------------------+ 89 * +------------------+ \---------------------------------+
90 * 90 *
91 * The goal of the UTLS page is to provide an area where we can store whatever 91 * The goal of the UTLS page is to provide an area where we can store whatever
92 * we want, in a way that it is accessible both when the Kernel and when the 92 * we want, in a way that it is accessible both when the Kernel and when the
93 * User page tables are loaded in CR3. 93 * User page tables are loaded in CR3.
94 * 94 *
95 * We store in the UTLS page three 64bit values: 95 * We store in the UTLS page three 64bit values:
96 * 96 *
97 * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel 97 * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
98 * page tables. 98 * page tables.
99 * 99 *
100 * o UTLS_SCRATCH: a dummy place where we temporarily store a value during 100 * o UTLS_SCRATCH: a dummy place where we temporarily store a value during
101 * the syscall entry procedure. 101 * the syscall entry procedure.
102 * 102 *
103 * o UTLS_RSP0: the value we must put in RSP in order to have a stack where 103 * o UTLS_RSP0: the value we must put in RSP in order to have a stack where
104 * we can push the register states. This is used only during the syscall 104 * we can push the register states. This is used only during the syscall
105 * entry procedure, because there the CPU does not automatically switch 105 * entry procedure, because there the CPU does not automatically switch
106 * RSP (it does not use the TSS.rsp0 mechanism described below). 106 * RSP (it does not use the TSS.rsp0 mechanism described below).
107 * 107 *
108 * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~ 108 * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
109 * 109 *
110 * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between 110 * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
111 * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to 111 * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
112 * the stack of the new LWP. Then the execution continues. At some point, the 112 * the stack of the new LWP. Then the execution continues. At some point, the
113 * user LWP we context-switched to will perform a syscall or will receive an 113 * user LWP we context-switched to will perform a syscall or will receive an
114 * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a 114 * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
115 * stack. The kernel then pushes the register states on this stack, and 115 * stack. The kernel then pushes the register states on this stack, and
116 * executes in kernel mode normally. 116 * executes in kernel mode normally.
117 * 117 *
118 * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore, 118 * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
119 * when an interrupt is received while we were in kernel mode, the CPU does not 119 * when an interrupt is received while we were in kernel mode, the CPU does not
120 * read TSS.rsp0. Instead, it just uses the current stack. 120 * read TSS.rsp0. Instead, it just uses the current stack.
121 * 121 *
122 * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~ 122 * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
123 * 123 *
124 * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU 124 * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
125 * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do 125 * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
126 * _not_ have associated physical addresses. They are only two VAs. 126 * _not_ have associated physical addresses. They are only two VAs.
127 * 127 *
128 * The first page is unmapped and acts as a redzone. The second page is 128 * The first page is unmapped and acts as a redzone. The second page is
129 * dynamically kentered into the highest page of the real per-lwp kernel stack; 129 * dynamically kentered into the highest page of the real per-lwp kernel stack;
130 * but pay close attention, it is kentered _only_ in the user page tables. 130 * but pay close attention, it is kentered _only_ in the user page tables.
131 * That is to say, the VA of this second page is mapped when the user page 131 * That is to say, the VA of this second page is mapped when the user page
132 * tables are loaded, but not mapped when the kernel page tables are loaded. 132 * tables are loaded, but not mapped when the kernel page tables are loaded.
133 * 133 *
134 * During a context switch, svs_lwp_switch() gets called first. This function 134 * During a context switch, svs_lwp_switch() gets called first. This function
135 * does the kenter job described above, not in the kernel page tables (that 135 * does the kenter job described above, not in the kernel page tables (that
136 * are currently loaded), but in the user page tables (that are not loaded). 136 * are currently loaded), but in the user page tables (that are not loaded).
 137 * 137 *
138 * VIRTUAL ADDRESSES PHYSICAL ADDRESSES 138 * VIRTUAL ADDRESSES PHYSICAL ADDRESSES
139 * 139 *
140 * +-----------------------------+ 140 * +-----------------------------+
141 * | KERNEL PAGE TABLES | 141 * | KERNEL PAGE TABLES |
142 * | +-------------------+ | +-------------------+ 142 * | +-------------------+ | +-------------------+
143 * | | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) | 143 * | | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
144 * | +-------------------+ | +-------------------+ 144 * | +-------------------+ | +-------------------+
145 * | | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) | 145 * | | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
146 * | +-------------------+ | +-------------------+ 146 * | +-------------------+ | +-------------------+
147 * | | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) | 147 * | | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
148 * | +-------------------+ | +-------------------+ 148 * | +-------------------+ | +-------------------+
149 * | | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) | 149 * | | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
150 * | +-------------------+ | +-> +-------------------+ 150 * | +-------------------+ | +-> +-------------------+
151 * +-----------------------------+ | 151 * +-----------------------------+ |
152 * | 152 * |
153 * +---------------------------------------+ | 153 * +---------------------------------------+ |
154 * | USER PAGE TABLES | | 154 * | USER PAGE TABLES | |
155 * | +----------------------------------+ | | 155 * | +----------------------------------+ | |
156 * | | pcpuarea->ent[cid].rsp0 (page 0) | | | 156 * | | pcpuarea->ent[cid].rsp0 (page 0) | | |
157 * | +----------------------------------+ | | 157 * | +----------------------------------+ | |
158 * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+ 158 * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
159 * | +----------------------------------+ | 159 * | +----------------------------------+ |
160 * +---------------------------------------+ 160 * +---------------------------------------+
161 * 161 *
162 * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1) 162 * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
163 * in TSS.rsp0. Later, when returning to userland on the lwp we context- 163 * in TSS.rsp0. Later, when returning to userland on the lwp we context-
164 * switched to, we will load the user page tables and execute in userland 164 * switched to, we will load the user page tables and execute in userland
165 * normally. 165 * normally.
166 * 166 *
167 * Next time an interrupt or syscall is received, the CPU will automatically 167 * Next time an interrupt or syscall is received, the CPU will automatically
168 * use TSS.rsp0 as a stack. Here it is executing with the user page tables 168 * use TSS.rsp0 as a stack. Here it is executing with the user page tables
169 * loaded, and therefore TSS.rsp0 is _mapped_. 169 * loaded, and therefore TSS.rsp0 is _mapped_.
170 * 170 *
171 * As part of the kernel entry procedure, we now switch CR3 to load the kernel 171 * As part of the kernel entry procedure, we now switch CR3 to load the kernel
172 * page tables. Here, we are still using the stack pointer we set in TSS.rsp0. 172 * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
173 * 173 *
174 * Remember that it was only one page of stack which was mapped only in the 174 * Remember that it was only one page of stack which was mapped only in the
175 * user page tables. We just switched to the kernel page tables, so we must 175 * user page tables. We just switched to the kernel page tables, so we must
176 * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so, 176 * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
177 * without touching the stack (since it is now unmapped, touching it would 177 * without touching the stack (since it is now unmapped, touching it would
178 * fault). 178 * fault).
179 * 179 *
180 * After we updated RSP, we can continue execution exactly as in the non-SVS 180 * After we updated RSP, we can continue execution exactly as in the non-SVS
181 * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if 181 * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
182 * we updated RSP to a totally different VA, this VA points to the same 182 * we updated RSP to a totally different VA, this VA points to the same
183 * physical page as TSS.rsp0. So in the end, the values the CPU pushed are 183 * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
184 * still here even with the new RSP. 184 * still here even with the new RSP.
185 * 185 *
186 * Thanks to this double-kenter optimization, we don't need to copy the 186 * Thanks to this double-kenter optimization, we don't need to copy the
187 * trapframe during each user<->kernel transition. 187 * trapframe during each user<->kernel transition.
188 * 188 *
189 * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 189 * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
190 * 190 *
191 * o Touching ci_svs_updir without holding ci_svs_mtx first is *not* 191 * o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
192 * allowed. 192 * allowed.
193 * 193 *
194 * o pm_kernel_cpus contains the set of CPUs that have the pmap loaded 194 * o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
195 * in their CR3 register. It must *not* be replaced by pm_cpus. 195 * in their CR3 register. It must *not* be replaced by pm_cpus.
196 * 196 *
197 * o When a context switch on the current CPU is made from a user LWP 197 * o When a context switch on the current CPU is made from a user LWP
198 * towards a kernel LWP, CR3 is not updated. Therefore, the pmap's 198 * towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
199 * pm_kernel_cpus still contains the current CPU. It implies that the 199 * pm_kernel_cpus still contains the current CPU. It implies that the
200 * remote CPUs that execute other threads of the user process we just 200 * remote CPUs that execute other threads of the user process we just
201 * left will keep synchronizing us against their changes. 201 * left will keep synchronizing us against their changes.
202 * 202 *
203 * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~ 203 * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
204 * 204 *
205 * o PTE Space 205 * o PTE Space
206 * o Direct Map 206 * o Direct Map
207 * o Remote PCPU Areas 207 * o Remote PCPU Areas
208 * o Kernel Heap 208 * o Kernel Heap
209 * o Kernel Image 209 * o Kernel Image
210 * 210 *
211 * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 211 * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
212 * 212 *
213 * Ordered from highest priority to lowest: 213 * Ordered from highest priority to lowest:
214 * 214 *
215 * o The NMI stack is not double-entered. Therefore if we ever receive an NMI 215 * o The NMI stack is not double-entered. Therefore if we ever receive an NMI
216 * and leave it, the content of the stack will be visible to userland (via 216 * and leave it, the content of the stack will be visible to userland (via
217 * Meltdown). Normally we never leave NMIs, unless a privileged user 217 * Meltdown). Normally we never leave NMIs, unless a privileged user
218 * launched PMCs. That's unlikely to happen, our PMC support is pretty 218 * launched PMCs. That's unlikely to happen, our PMC support is pretty
219 * minimal, and privileged only. 219 * minimal, and privileged only.
220 * 220 *
221 * o Narrow down the entry points: hide the 'jmp handler' instructions. This 221 * o Narrow down the entry points: hide the 'jmp handler' instructions. This
222 * makes sense on GENERIC_KASLR kernels. 222 * makes sense on GENERIC_KASLR kernels.
223 * 223 *
224 * o Right now there is only one global LDT, and that's not compatible with 224 * o Right now there is only one global LDT, and that's not compatible with
225 * USER_LDT. 225 * USER_LDT.
226 */ 226 */
227 227
228bool svs_enabled __read_mostly = false; 228bool svs_enabled __read_mostly = false;
229 229
230struct svs_utls { 230struct svs_utls {
231 paddr_t kpdirpa; 231 paddr_t kpdirpa;
232 uint64_t scratch; 232 uint64_t scratch;
233 vaddr_t rsp0; 233 vaddr_t rsp0;
234}; 234};
235 235
236static pd_entry_t * 236static pd_entry_t *
237svs_tree_add(struct cpu_info *ci, vaddr_t va) 237svs_tree_add(struct cpu_info *ci, vaddr_t va)
238{ 238{
239 extern const vaddr_t ptp_masks[]; 239 extern const vaddr_t ptp_masks[];
240 extern const int ptp_shifts[]; 240 extern const int ptp_shifts[];
241 extern const long nbpd[]; 241 extern const long nbpd[];
242 pd_entry_t *dstpde; 242 pd_entry_t *dstpde;
243 size_t i, pidx, mod; 243 size_t i, pidx, mod;
244 struct vm_page *pg; 244 struct vm_page *pg;
245 paddr_t pa; 245 paddr_t pa;
246 246
247 dstpde = ci->ci_svs_updir; 247 dstpde = ci->ci_svs_updir;
248 mod = (size_t)-1; 248 mod = (size_t)-1;
249 249
250 for (i = PTP_LEVELS; i > 1; i--) { 250 for (i = PTP_LEVELS; i > 1; i--) {
251 pidx = pl_i(va % mod, i); 251 pidx = pl_i(va % mod, i);
252 252
253 if (!pmap_valid_entry(dstpde[pidx])) { 253 if (!pmap_valid_entry(dstpde[pidx])) {
254 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); 254 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
255 if (pg == 0) 255 if (pg == 0)
256 panic("%s: failed to allocate PA for CPU %d\n", 256 panic("%s: failed to allocate PA for CPU %d\n",
257 __func__, cpu_index(ci)); 257 __func__, cpu_index(ci));
258 pa = VM_PAGE_TO_PHYS(pg); 258 pa = VM_PAGE_TO_PHYS(pg);
259 259
260 dstpde[pidx] = PG_V | PG_RW | pa; 260 dstpde[pidx] = PG_V | PG_RW | pa;
261 } 261 }
262 262
263 pa = (paddr_t)(dstpde[pidx] & PG_FRAME); 263 pa = (paddr_t)(dstpde[pidx] & PG_FRAME);
264 dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa); 264 dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
265 mod = nbpd[i-1]; 265 mod = nbpd[i-1];
266 } 266 }
267 267
268 return dstpde; 268 return dstpde;
269} 269}
270 270
271static void 271static void
272svs_page_add(struct cpu_info *ci, vaddr_t va) 272svs_page_add(struct cpu_info *ci, vaddr_t va)
273{ 273{
274 pd_entry_t *srcpde, *dstpde, pde; 274 pd_entry_t *srcpde, *dstpde, pde;
275 size_t idx, pidx; 275 size_t idx, pidx;
276 paddr_t pa; 276 paddr_t pa;
277 277
278 /* Create levels L4, L3 and L2. */ 278 /* Create levels L4, L3 and L2. */
279 dstpde = svs_tree_add(ci, va); 279 dstpde = svs_tree_add(ci, va);
280 280
281 pidx = pl1_i(va % NBPD_L2); 281 pidx = pl1_i(va % NBPD_L2);
282 282
283 /* 283 /*
284 * If 'va' is in a large page, we need to compute its physical 284 * If 'va' is in a large page, we need to compute its physical
285 * address manually. 285 * address manually.
286 */ 286 */
287 idx = pl2_i(va); 287 idx = pl2_i(va);
288 srcpde = L2_BASE; 288 srcpde = L2_BASE;
289 if (!pmap_valid_entry(srcpde[idx])) { 289 if (!pmap_valid_entry(srcpde[idx])) {
290 panic("%s: L2 page not mapped", __func__); 290 panic("%s: L2 page not mapped", __func__);
291 } 291 }
292 if (srcpde[idx] & PG_PS) { 292 if (srcpde[idx] & PG_PS) {
293 pa = srcpde[idx] & PG_2MFRAME; 293 pa = srcpde[idx] & PG_2MFRAME;
294 pa += (paddr_t)(va % NBPD_L2); 294 pa += (paddr_t)(va % NBPD_L2);
295 pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa; 295 pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa;
296 296
297 if (pmap_valid_entry(dstpde[pidx])) { 297 if (pmap_valid_entry(dstpde[pidx])) {
298 panic("%s: L1 page already mapped", __func__); 298 panic("%s: L1 page already mapped", __func__);
299 } 299 }
300 dstpde[pidx] = pde; 300 dstpde[pidx] = pde;
301 return; 301 return;
302 } 302 }
303 303
304 /* 304 /*
305 * Normal page, just copy the PDE. 305 * Normal page, just copy the PDE.
306 */ 306 */
307 idx = pl1_i(va); 307 idx = pl1_i(va);
308 srcpde = L1_BASE; 308 srcpde = L1_BASE;
309 if (!pmap_valid_entry(srcpde[idx])) { 309 if (!pmap_valid_entry(srcpde[idx])) {
310 panic("%s: L1 page not mapped", __func__); 310 panic("%s: L1 page not mapped", __func__);
311 } 311 }
312 if (pmap_valid_entry(dstpde[pidx])) { 312 if (pmap_valid_entry(dstpde[pidx])) {
313 panic("%s: L1 page already mapped", __func__); 313 panic("%s: L1 page already mapped", __func__);
314 } 314 }
315 dstpde[pidx] = srcpde[idx] & ~(PG_G); 315 dstpde[pidx] = srcpde[idx] & ~(PG_G);
316} 316}
317 317
318static void 318static void
319svs_rsp0_init(struct cpu_info *ci) 319svs_rsp0_init(struct cpu_info *ci)
320{ 320{
321 const cpuid_t cid = cpu_index(ci); 321 const cpuid_t cid = cpu_index(ci);
322 vaddr_t va, rsp0; 322 vaddr_t va, rsp0;
323 pd_entry_t *pd; 323 pd_entry_t *pd;
324 size_t pidx; 324 size_t pidx;
325 325
326 rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0; 326 rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
327 327
328 /* The first page is a redzone. */ 328 /* The first page is a redzone. */
329 va = rsp0 + PAGE_SIZE; 329 va = rsp0 + PAGE_SIZE;
330 330
331 /* Create levels L4, L3 and L2. */ 331 /* Create levels L4, L3 and L2. */
332 pd = svs_tree_add(ci, va); 332 pd = svs_tree_add(ci, va);
333 333
334 /* Get the info for L1. */ 334 /* Get the info for L1. */
335 pidx = pl1_i(va % NBPD_L2); 335 pidx = pl1_i(va % NBPD_L2);
336 if (pmap_valid_entry(pd[pidx])) { 336 if (pmap_valid_entry(pd[pidx])) {
337 panic("%s: rsp0 page already mapped", __func__); 337 panic("%s: rsp0 page already mapped", __func__);
338 } 338 }
339 339
340 ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx]; 340 ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
341 ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe); 341 ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
342 ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe); 342 ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
343 ci->ci_svs_krsp0 = 0; 343 ci->ci_svs_krsp0 = 0;
344} 344}
345 345
346static void 346static void
347svs_utls_init(struct cpu_info *ci) 347svs_utls_init(struct cpu_info *ci)
348{ 348{
349 const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls; 349 const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
350 struct svs_utls *utls; 350 struct svs_utls *utls;
351 struct vm_page *pg; 351 struct vm_page *pg;
352 pd_entry_t *pd; 352 pd_entry_t *pd;
353 size_t pidx; 353 size_t pidx;
354 paddr_t pa; 354 paddr_t pa;
355 vaddr_t va; 355 vaddr_t va;
356 356
357 /* Create levels L4, L3 and L2 of the UTLS page. */ 357 /* Create levels L4, L3 and L2 of the UTLS page. */
358 pd = svs_tree_add(ci, utlsva); 358 pd = svs_tree_add(ci, utlsva);
359 359
360 /* Allocate L1. */ 360 /* Allocate L1. */
361 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); 361 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
362 if (pg == 0) 362 if (pg == 0)
363 panic("%s: failed to allocate PA for CPU %d\n", __func__, 363 panic("%s: failed to allocate PA for CPU %d\n", __func__,
364 cpu_index(ci)); 364 cpu_index(ci));
365 pa = VM_PAGE_TO_PHYS(pg); 365 pa = VM_PAGE_TO_PHYS(pg);
366 366
367 /* Enter L1. */ 367 /* Enter L1. */
368 if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) { 368 if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
369 panic("%s: local page already mapped", __func__); 369 panic("%s: local page already mapped", __func__);
370 } 370 }
371 pidx = pl1_i(utlsva % NBPD_L2); 371 pidx = pl1_i(utlsva % NBPD_L2);
372 if (pmap_valid_entry(pd[pidx])) { 372 if (pmap_valid_entry(pd[pidx])) {
373 panic("%s: L1 page already mapped", __func__); 373 panic("%s: L1 page already mapped", __func__);
374 } 374 }
375 pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa; 375 pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa;
376 376
377 /* 377 /*
378 * Now, allocate a VA in the kernel map, that points to the UTLS 378 * Now, allocate a VA in the kernel map, that points to the UTLS
379 * page. After that, the UTLS page will be accessible in kernel 379 * page. After that, the UTLS page will be accessible in kernel
380 * mode via ci_svs_utls. 380 * mode via ci_svs_utls.
381 */ 381 */
382 va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 382 va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
383 UVM_KMF_VAONLY|UVM_KMF_NOWAIT); 383 UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
384 if (va == 0) { 384 if (va == 0) {
385 panic("%s: unable to allocate VA\n", __func__); 385 panic("%s: unable to allocate VA\n", __func__);
386 } 386 }
387 pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0); 387 pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
388 pmap_update(pmap_kernel()); 388 pmap_update(pmap_kernel());
389 389
390 ci->ci_svs_utls = va; 390 ci->ci_svs_utls = va;
391 391
392 /* Initialize the constant fields of the UTLS page */ 392 /* Initialize the constant fields of the UTLS page */
393 utls = (struct svs_utls *)ci->ci_svs_utls; 393 utls = (struct svs_utls *)ci->ci_svs_utls;
394 utls->rsp0 = ci->ci_svs_rsp0; 394 utls->rsp0 = ci->ci_svs_rsp0;
395} 395}
396 396
397static void 397static void
398svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size) 398svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size)
399{ 399{
400 size_t i, n; 400 size_t i, n;
401 401
402 KASSERT(size % PAGE_SIZE == 0); 402 KASSERT(size % PAGE_SIZE == 0);
403 n = size / PAGE_SIZE; 403 n = size / PAGE_SIZE;
404 for (i = 0; i < n; i++) { 404 for (i = 0; i < n; i++) {
405 svs_page_add(ci, va + i * PAGE_SIZE); 405 svs_page_add(ci, va + i * PAGE_SIZE);
406 } 406 }
407} 407}
408 408
409void 409void
410cpu_svs_init(struct cpu_info *ci) 410cpu_svs_init(struct cpu_info *ci)
411{ 411{
412 extern char __text_user_start; 412 extern char __text_user_start;
413 extern char __text_user_end; 413 extern char __text_user_end;
414 const cpuid_t cid = cpu_index(ci); 414 const cpuid_t cid = cpu_index(ci);
415 struct vm_page *pg; 415 struct vm_page *pg;
416 416
417 KASSERT(ci != NULL); 417 KASSERT(ci != NULL);
418 418
419 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); 419 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
420 if (pg == 0) 420 if (pg == 0)
421 panic("%s: failed to allocate L4 PA for CPU %d\n", 421 panic("%s: failed to allocate L4 PA for CPU %d\n",
422 __func__, cpu_index(ci)); 422 __func__, cpu_index(ci));
423 ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg); 423 ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
424 424
425 ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 425 ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
426 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 426 UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
427 if (ci->ci_svs_updir == NULL) 427 if (ci->ci_svs_updir == NULL)
428 panic("%s: failed to allocate L4 VA for CPU %d\n", 428 panic("%s: failed to allocate L4 VA for CPU %d\n",
429 __func__, cpu_index(ci)); 429 __func__, cpu_index(ci));
430 430
431 pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa, 431 pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
432 VM_PROT_READ | VM_PROT_WRITE, 0); 432 VM_PROT_READ | VM_PROT_WRITE, 0);
433 433
434 pmap_update(pmap_kernel()); 434 pmap_update(pmap_kernel());
435 435
436 ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0); 436 ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0);
437 437
438 mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM); 438 mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
439 439
440 svs_page_add(ci, (vaddr_t)&pcpuarea->idt); 440 svs_page_add(ci, (vaddr_t)&pcpuarea->idt);
441 svs_page_add(ci, (vaddr_t)&pcpuarea->ldt); 441 svs_page_add(ci, (vaddr_t)&pcpuarea->ldt);
442 svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid], 442 svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
443 offsetof(struct pcpu_entry, rsp0)); 443 offsetof(struct pcpu_entry, rsp0));
444 svs_range_add(ci, (vaddr_t)&__text_user_start, 444 svs_range_add(ci, (vaddr_t)&__text_user_start,
445 (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start); 445 (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start);
446 446
447 svs_rsp0_init(ci); 447 svs_rsp0_init(ci);
448 svs_utls_init(ci); 448 svs_utls_init(ci);
449} 449}
450 450
451void 451void
452svs_pmap_sync(struct pmap *pmap, int index) 452svs_pmap_sync(struct pmap *pmap, int index)
453{ 453{
454 CPU_INFO_ITERATOR cii; 454 CPU_INFO_ITERATOR cii;
455 struct cpu_info *ci; 455 struct cpu_info *ci;
456 cpuid_t cid; 456 cpuid_t cid;
457 457
458 KASSERT(pmap != NULL); 458 KASSERT(pmap != NULL);
459 KASSERT(pmap != pmap_kernel()); 459 KASSERT(pmap != pmap_kernel());
460 KASSERT(mutex_owned(pmap->pm_lock)); 460 KASSERT(mutex_owned(pmap->pm_lock));
461 KASSERT(kpreempt_disabled()); 461 KASSERT(kpreempt_disabled());
462 KASSERT(index < 255); 462 KASSERT(index < 255);
463 463
464 for (CPU_INFO_FOREACH(cii, ci)) { 464 for (CPU_INFO_FOREACH(cii, ci)) {
465 cid = cpu_index(ci); 465 cid = cpu_index(ci);
466 466
467 if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) { 467 if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
468 continue; 468 continue;
469 } 469 }
470 470
471 /* take the lock and check again */ 471 /* take the lock and check again */
472 mutex_enter(&ci->ci_svs_mtx); 472 mutex_enter(&ci->ci_svs_mtx);
473 if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) { 473 if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
474 ci->ci_svs_updir[index] = pmap->pm_pdir[index]; 474 ci->ci_svs_updir[index] = pmap->pm_pdir[index];
475 } 475 }
476 mutex_exit(&ci->ci_svs_mtx); 476 mutex_exit(&ci->ci_svs_mtx);
477 } 477 }
478} 478}
479 479
480void 480void
481svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp) 481svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
482{ 482{
483 struct cpu_info *ci = curcpu(); 483 struct cpu_info *ci = curcpu();
484 struct svs_utls *utls; 484 struct svs_utls *utls;
485 struct pcb *pcb; 485 struct pcb *pcb;
486 pt_entry_t *pte; 486 pt_entry_t *pte;
487 uintptr_t rsp0; 487 uintptr_t rsp0;
488 vaddr_t va; 488 vaddr_t va;
489 489
490 if (newlwp->l_flag & LW_SYSTEM) { 490 if (newlwp->l_flag & LW_SYSTEM) {
491 return; 491 return;
492 } 492 }
493 493
494#ifdef DIAGNOSTIC 494#ifdef DIAGNOSTIC
495 if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) { 495 if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) {
496 pcb = lwp_getpcb(oldlwp); 496 pcb = lwp_getpcb(oldlwp);
497 rsp0 = pcb->pcb_rsp0; 497 rsp0 = pcb->pcb_rsp0;
498 va = rounddown(rsp0, PAGE_SIZE); 498 va = rounddown(rsp0, PAGE_SIZE);
499 KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe)); 499 KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
500 pte = ci->ci_svs_rsp0_pte; 500 pte = ci->ci_svs_rsp0_pte;
501 KASSERT(*pte == L1_BASE[pl1_i(va)]); 501 KASSERT(*pte == L1_BASE[pl1_i(va)]);
502 } 502 }
503#endif 503#endif
504 504
505 pcb = lwp_getpcb(newlwp); 505 pcb = lwp_getpcb(newlwp);
506 rsp0 = pcb->pcb_rsp0; 506 rsp0 = pcb->pcb_rsp0;
507 va = rounddown(rsp0, PAGE_SIZE); 507 va = rounddown(rsp0, PAGE_SIZE);
508 508
509 /* Update the kernel rsp0 in cpu_info */ 509 /* Update the kernel rsp0 in cpu_info */
510 ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe); 510 ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
511 KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) == 511 KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
512 (ci->ci_svs_ursp0 % PAGE_SIZE)); 512 (ci->ci_svs_ursp0 % PAGE_SIZE));
513 513
514 utls = (struct svs_utls *)ci->ci_svs_utls; 514 utls = (struct svs_utls *)ci->ci_svs_utls;
515 utls->scratch = 0; 515 utls->scratch = 0;
516 516
517 /* 517 /*
518 * Enter the user rsp0. We don't need to flush the TLB here, since 518 * Enter the user rsp0. We don't need to flush the TLB here, since
519 * the user page tables are not loaded. 519 * the user page tables are not loaded.
520 */ 520 */
521 pte = ci->ci_svs_rsp0_pte; 521 pte = ci->ci_svs_rsp0_pte;
522 *pte = L1_BASE[pl1_i(va)]; 522 *pte = L1_BASE[pl1_i(va)];
523} 523}
524 524
525static inline pt_entry_t 525static inline pt_entry_t
526svs_pte_atomic_read(struct pmap *pmap, size_t idx) 526svs_pte_atomic_read(struct pmap *pmap, size_t idx)
527{ 527{
528 /* 528 /*
529 * XXX: We don't have a basic atomic_fetch_64 function? 529 * XXX: We don't have a basic atomic_fetch_64 function?
530 */ 530 */
531 return atomic_cas_64(&pmap->pm_pdir[idx], 666, 666); 531 return atomic_cas_64(&pmap->pm_pdir[idx], 666, 666);
532} 532}
533 533
534/* 534/*
535 * We may come here with the pmap unlocked. So read its PTEs atomically. If 535 * We may come here with the pmap unlocked. So read its PTEs atomically. If
536 * a remote CPU is updating them at the same time, it's not a problem: the 536 * a remote CPU is updating them at the same time, it's not a problem: the
537 * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be 537 * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be
538 * synchronized properly. 538 * synchronized properly.
539 */ 539 */
540void 540void
541svs_pdir_switch(struct pmap *pmap) 541svs_pdir_switch(struct pmap *pmap)
542{ 542{
543 struct cpu_info *ci = curcpu(); 543 struct cpu_info *ci = curcpu();
544 struct svs_utls *utls; 544 struct svs_utls *utls;
545 pt_entry_t pte; 545 pt_entry_t pte;
546 size_t i; 546 size_t i;
547 547
548 KASSERT(kpreempt_disabled()); 548 KASSERT(kpreempt_disabled());
549 KASSERT(pmap != pmap_kernel()); 549 KASSERT(pmap != pmap_kernel());
550 550
551 ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0); 551 ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0);
552 552
553 /* Update the info in the UTLS page */ 553 /* Update the info in the UTLS page */
554 utls = (struct svs_utls *)ci->ci_svs_utls; 554 utls = (struct svs_utls *)ci->ci_svs_utls;
555 utls->kpdirpa = ci->ci_svs_kpdirpa; 555 utls->kpdirpa = ci->ci_svs_kpdirpa;
556 556
557 mutex_enter(&ci->ci_svs_mtx); 557 mutex_enter(&ci->ci_svs_mtx);
558 558
559 /* User slots. */ 559 /* User slots. */
560 for (i = 0; i < 255; i++) { 560 for (i = 0; i < 255; i++) {
561 pte = svs_pte_atomic_read(pmap, i); 561 pte = svs_pte_atomic_read(pmap, i);
562 ci->ci_svs_updir[i] = pte; 562 ci->ci_svs_updir[i] = pte;
563 } 563 }
564 564
565 mutex_exit(&ci->ci_svs_mtx); 565 mutex_exit(&ci->ci_svs_mtx);
566} 566}
567 567
568static void 568static void
569svs_enable(void) 569svs_enable(void)
570{ 570{
571 extern uint8_t svs_enter, svs_enter_end; 571 extern uint8_t svs_enter, svs_enter_end;
572 extern uint8_t svs_enter_altstack, svs_enter_altstack_end; 572 extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
573 extern uint8_t svs_leave, svs_leave_end; 573 extern uint8_t svs_leave, svs_leave_end;
574 extern uint8_t svs_leave_altstack, svs_leave_altstack_end; 574 extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
575 u_long psl, cr0; 575 u_long psl, cr0;
576 uint8_t *bytes; 576 uint8_t *bytes;
577 size_t size; 577 size_t size;
578 578
579 svs_enabled = true; 579 svs_enabled = true;
580 580
581 x86_patch_window_open(&psl, &cr0); 581 x86_patch_window_open(&psl, &cr0);
582 582
583 bytes = &svs_enter; 583 bytes = &svs_enter;
584 size = (size_t)&svs_enter_end - (size_t)&svs_enter; 584 size = (size_t)&svs_enter_end - (size_t)&svs_enter;
585 x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); 585 x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
586 586
587 bytes = &svs_enter_altstack; 587 bytes = &svs_enter_altstack;
588 size = (size_t)&svs_enter_altstack_end - 588 size = (size_t)&svs_enter_altstack_end -
589 (size_t)&svs_enter_altstack; 589 (size_t)&svs_enter_altstack;
590 x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); 590 x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
591 591
592 bytes = &svs_leave; 592 bytes = &svs_leave;
593 size = (size_t)&svs_leave_end - (size_t)&svs_leave; 593 size = (size_t)&svs_leave_end - (size_t)&svs_leave;
594 x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); 594 x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
595 595
596 bytes = &svs_leave_altstack; 596 bytes = &svs_leave_altstack;
597 size = (size_t)&svs_leave_altstack_end - 597 size = (size_t)&svs_leave_altstack_end -
598 (size_t)&svs_leave_altstack; 598 (size_t)&svs_leave_altstack;
599 x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); 599 x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
600 600
601 x86_patch_window_close(psl, cr0); 601 x86_patch_window_close(psl, cr0);
602} 602}
603 603
604static void 604static void
605svs_disable_hotpatch(void) 605svs_disable_hotpatch(void)
606{ 606{
607 extern uint8_t nosvs_enter, nosvs_enter_end; 607 extern uint8_t nosvs_enter, nosvs_enter_end;
608 extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end; 608 extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end;
609 extern uint8_t nosvs_leave, nosvs_leave_end; 609 extern uint8_t nosvs_leave, nosvs_leave_end;
610 extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end; 610 extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end;
611 u_long psl, cr0; 611 u_long psl, cr0;
612 uint8_t *bytes; 612 uint8_t *bytes;
613 size_t size; 613 size_t size;
614 614
615 x86_patch_window_open(&psl, &cr0); 615 x86_patch_window_open(&psl, &cr0);
616 616
617 bytes = &nosvs_enter; 617 bytes = &nosvs_enter;
618 size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter; 618 size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter;
619 x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); 619 x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
620 620
621 bytes = &nosvs_enter_altstack; 621 bytes = &nosvs_enter_altstack;
622 size = (size_t)&nosvs_enter_altstack_end - 622 size = (size_t)&nosvs_enter_altstack_end -
623 (size_t)&nosvs_enter_altstack; 623 (size_t)&nosvs_enter_altstack;
624 x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); 624 x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
625 625
626 bytes = &nosvs_leave; 626 bytes = &nosvs_leave;
627 size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave; 627 size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave;
628 x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); 628 x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
629 629
630 bytes = &nosvs_leave_altstack; 630 bytes = &nosvs_leave_altstack;
631 size = (size_t)&nosvs_leave_altstack_end - 631 size = (size_t)&nosvs_leave_altstack_end -
632 (size_t)&nosvs_leave_altstack; 632 (size_t)&nosvs_leave_altstack;
633 x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); 633 x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
634 634
635 x86_patch_window_close(psl, cr0); 635 x86_patch_window_close(psl, cr0);
636} 636}
637 637
638static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned; 638static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned;
639static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned; 639static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned;
640typedef void (vector)(void); 640typedef void (vector)(void);
641 641
642static void 642static void
643svs_disable_cpu(void *arg1, void *arg2) 643svs_disable_cpu(void *arg1, void *arg2)
644{ 644{
645 struct cpu_info *ci = curcpu(); 645 struct cpu_info *ci = curcpu();
646 extern vector Xsyscall; 646 extern vector Xsyscall;
647 u_long psl; 647 u_long psl;
648 648
649 psl = x86_read_psl(); 649 psl = x86_read_psl();
 650 x86_disable_intr();
650 651
651 atomic_dec_ulong(&svs_cpu_barrier1); 652 atomic_dec_ulong(&svs_cpu_barrier1);
652 while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0) { 653 while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0) {
653 x86_pause(); 654 x86_pause();
654 } 655 }
655 656
656 /* cpu0 is the one that does the hotpatch job */ 657 /* cpu0 is the one that does the hotpatch job */
657 if (ci == &cpu_info_primary) { 658 if (ci == &cpu_info_primary) {
658 svs_enabled = false; 659 svs_enabled = false;
659 svs_disable_hotpatch(); 660 svs_disable_hotpatch();
660 } 661 }
661 662
662 /* put back the non-SVS syscall entry point */ 663 /* put back the non-SVS syscall entry point */
663 wrmsr(MSR_LSTAR, (uint64_t)Xsyscall); 664 wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
664 665
665 /* enable global pages */ 666 /* enable global pages */
666 if (cpu_feature[0] & CPUID_PGE) 667 if (cpu_feature[0] & CPUID_PGE)
667 lcr4(rcr4() | CR4_PGE); 668 lcr4(rcr4() | CR4_PGE);
668 669
669 atomic_dec_ulong(&svs_cpu_barrier2); 670 atomic_dec_ulong(&svs_cpu_barrier2);
670 while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0) { 671 while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0) {
671 x86_pause(); 672 x86_pause();
672 } 673 }
673 674
674 /* Write back and invalidate cache, flush pipelines. */ 675 /* Write back and invalidate cache, flush pipelines. */
675 wbinvd(); 676 wbinvd();
676 x86_flush(); 677 x86_flush();
677 678
678 x86_write_psl(psl); 679 x86_write_psl(psl);
679} 680}
680 681
681static int 682static int
682svs_disable(void) 683svs_disable(void)
683{ 684{
684 struct cpu_info *ci = NULL; 685 struct cpu_info *ci = NULL;
685 CPU_INFO_ITERATOR cii; 686 CPU_INFO_ITERATOR cii;
686 uint64_t xc; 687 uint64_t xc;
687 688
688 mutex_enter(&cpu_lock); 689 mutex_enter(&cpu_lock);
689 690
690 /* 691 /*
691 * We expect all the CPUs to be online. 692 * We expect all the CPUs to be online.
692 */ 693 */
693 for (CPU_INFO_FOREACH(cii, ci)) { 694 for (CPU_INFO_FOREACH(cii, ci)) {
694 struct schedstate_percpu *spc = &ci->ci_schedstate; 695 struct schedstate_percpu *spc = &ci->ci_schedstate;
695 if (spc->spc_flags & SPCF_OFFLINE) { 696 if (spc->spc_flags & SPCF_OFFLINE) {
696 printf("[!] cpu%d offline, SVS not disabled\n", 697 printf("[!] cpu%d offline, SVS not disabled\n",
697 cpu_index(ci)); 698 cpu_index(ci));
698 mutex_exit(&cpu_lock); 699 mutex_exit(&cpu_lock);
699 return EOPNOTSUPP; 700 return EOPNOTSUPP;
700 } 701 }
701 } 702 }
702 703
703 svs_cpu_barrier1 = ncpu; 704 svs_cpu_barrier1 = ncpu;
704 svs_cpu_barrier2 = ncpu; 705 svs_cpu_barrier2 = ncpu;
705 706
706 printf("[+] Disabling SVS..."); 707 printf("[+] Disabling SVS...");
707 xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL); 708 xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL);
708 xc_wait(xc); 709 xc_wait(xc);
709 printf(" done!\n"); 710 printf(" done!\n");
710 711
711 mutex_exit(&cpu_lock); 712 mutex_exit(&cpu_lock);
712 713
713 return 0; 714 return 0;
714} 715}
715 716
716int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS); 717int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
717 718
718int 719int
719sysctl_machdep_svs_enabled(SYSCTLFN_ARGS) 720sysctl_machdep_svs_enabled(SYSCTLFN_ARGS)
720{ 721{
721 struct sysctlnode node; 722 struct sysctlnode node;
722 int error, val; 723 int error, val;
723 724
724 val = *(int *)rnode->sysctl_data; 725 val = *(int *)rnode->sysctl_data;
725 726
726 node = *rnode; 727 node = *rnode;
727 node.sysctl_data = &val; 728 node.sysctl_data = &val;
728 729
729 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 730 error = sysctl_lookup(SYSCTLFN_CALL(&node));
730 if (error != 0 || newp == NULL) 731 if (error != 0 || newp == NULL)
731 return error; 732 return error;
732 733
733 if (val == 1) { 734 if (val == 1) {
734 error = EINVAL; 735 error = EINVAL;
735 } else { 736 } else {
736 if (svs_enabled) 737 if (svs_enabled)
737 error = svs_disable(); 738 error = svs_disable();
738 else 739 else
739 error = 0; 740 error = 0;
740 } 741 }
741 742
742 return error; 743 return error;
743} 744}
744 745
745void 746void
746svs_init(void) 747svs_init(void)
747{ 748{
748 if (cpu_vendor != CPUVENDOR_INTEL) { 749 if (cpu_vendor != CPUVENDOR_INTEL) {
749 return; 750 return;
750 } 751 }
751 svs_enable(); 752 svs_enable();
752} 753}
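
A side note on the context above, unrelated to this change: svs_pte_atomic_read()
works around the missing 64-bit atomic load primitive (see its XXX comment) by
issuing a compare-and-swap whose expected and new values are the same dummy
constant, so the swap either fails or rewrites the value already present, and in
both cases returns the old contents read atomically. A minimal sketch of that
idiom, with a hypothetical wrapper name:

/*
 * CAS-as-atomic-load: atomic_cas_64() returns the previous contents of
 * *p whether or not the comparison succeeds; when it does succeed, it
 * only writes back the value that was already there.
 */
static inline uint64_t
atomic_load_64_sketch(volatile uint64_t *p)
{
	return atomic_cas_64(p, 666, 666);
}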