| @@ -1,752 +1,753 @@ | | | @@ -1,752 +1,753 @@ |
1 | /* $NetBSD: svs.c,v 1.13 2018/03/01 16:49:06 maxv Exp $ */ | | 1 | /* $NetBSD: svs.c,v 1.14 2018/03/13 16:45:52 maxv Exp $ */ |
2 | | | 2 | |
3 | /* | | 3 | /* |
4 | * Copyright (c) 2018 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2018 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation | | 7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Maxime Villard. | | 8 | * by Maxime Villard. |
9 | * | | 9 | * |
10 | * Redistribution and use in source and binary forms, with or without | | 10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions | | 11 | * modification, are permitted provided that the following conditions |
12 | * are met: | | 12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright | | 13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. | | 14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright | | 15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the | | 16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. | | 17 | * documentation and/or other materials provided with the distribution. |
18 | * | | 18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS | | 19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | | 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. | | 29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ | | 30 | */ |
31 | | | 31 | |
32 | #include <sys/cdefs.h> | | 32 | #include <sys/cdefs.h> |
33 | __KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.13 2018/03/01 16:49:06 maxv Exp $"); | | 33 | __KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.14 2018/03/13 16:45:52 maxv Exp $"); |
34 | | | 34 | |
35 | #include "opt_svs.h" | | 35 | #include "opt_svs.h" |
36 | | | 36 | |
37 | #include <sys/param.h> | | 37 | #include <sys/param.h> |
38 | #include <sys/systm.h> | | 38 | #include <sys/systm.h> |
39 | #include <sys/proc.h> | | 39 | #include <sys/proc.h> |
40 | #include <sys/cpu.h> | | 40 | #include <sys/cpu.h> |
41 | #include <sys/sysctl.h> | | 41 | #include <sys/sysctl.h> |
42 | #include <sys/xcall.h> | | 42 | #include <sys/xcall.h> |
43 | | | 43 | |
44 | #include <x86/cputypes.h> | | 44 | #include <x86/cputypes.h> |
45 | #include <machine/cpuvar.h> | | 45 | #include <machine/cpuvar.h> |
46 | #include <machine/frameasm.h> | | 46 | #include <machine/frameasm.h> |
47 | | | 47 | |
48 | #include <uvm/uvm.h> | | 48 | #include <uvm/uvm.h> |
49 | #include <uvm/uvm_page.h> | | 49 | #include <uvm/uvm_page.h> |
50 | | | 50 | |
51 | /* | | 51 | /* |
52 | * Separate Virtual Space | | 52 | * Separate Virtual Space |
53 | * | | 53 | * |
54 | * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context | | 54 | * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context |
55 | * switch to a user pmap, the lower half of updirpa is populated with the | | 55 | * switch to a user pmap, the lower half of updirpa is populated with the |
56 | * entries containing the userland pages. | | 56 | * entries containing the userland pages. |
57 | * | | 57 | * |
58 | * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | 58 | * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
59 | * | | 59 | * |
60 | * We use a special per-cpu page that we call UTLS, for User Thread Local | | 60 | * We use a special per-cpu page that we call UTLS, for User Thread Local |
61 | * Storage. Each CPU has one UTLS page. This page has two VAs: | | 61 | * Storage. Each CPU has one UTLS page. This page has two VAs: |
62 | * | | 62 | * |
63 | * o When the user page tables are loaded in CR3, the VA to access this | | 63 | * o When the user page tables are loaded in CR3, the VA to access this |
64 | * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is | | 64 | * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is |
65 | * _constant_ across CPUs, but in the user page tables this VA points to | | 65 | * _constant_ across CPUs, but in the user page tables this VA points to |
66 | * the physical page of the UTLS that is _local_ to the CPU. | | 66 | * the physical page of the UTLS that is _local_ to the CPU. |
67 | * | | 67 | * |
68 | * o When the kernel page tables are loaded in CR3, the VA to access this | | 68 | * o When the kernel page tables are loaded in CR3, the VA to access this |
69 | * page is ci->ci_svs_utls. | | 69 | * page is ci->ci_svs_utls. |
70 | * | | 70 | * |
71 | * +----------------------------------------------------------------------+ | | 71 | * +----------------------------------------------------------------------+ |
72 | * | CPU0 Local Data (Physical Page) | | | 72 | * | CPU0 Local Data (Physical Page) | |
73 | * | +------------------+ +-------------+ | | | 73 | * | +------------------+ +-------------+ | |
74 | * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | | | | 74 | * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | | |
75 | * | +------------------+ +-------------+ | | | 75 | * | +------------------+ +-------------+ | |
76 | * +-------------------------------------------------------------^--------+ | | 76 | * +-------------------------------------------------------------^--------+ |
77 | * | | | 77 | * | |
78 | * +----------+ | | 78 | * +----------+ |
79 | * | | | 79 | * | |
80 | * +----------------------------------------------------------------------+ | | | 80 | * +----------------------------------------------------------------------+ | |
81 | * | CPU1 Local Data (Physical Page) | | | | 81 | * | CPU1 Local Data (Physical Page) | | |
82 | * | +------------------+ +-------------+ | | | | 82 | * | +------------------+ +-------------+ | | |
83 | * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | | | | 83 | * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | | |
84 | * | +------------------+ +-------------+ | | | | 84 | * | +------------------+ +-------------+ | | |
85 | * +-------------------------------------------------------------^--------+ | | | 85 | * +-------------------------------------------------------------^--------+ | |
86 | * | | | | 86 | * | | |
87 | * +------------------+ /----------------------+ | | | 87 | * +------------------+ /----------------------+ | |
88 | * | Kern Page Tables | ci->ci_svs_utls | | | 88 | * | Kern Page Tables | ci->ci_svs_utls | |
89 | * +------------------+ \---------------------------------+ | | 89 | * +------------------+ \---------------------------------+ |
90 | * | | 90 | * |
91 | * The goal of the UTLS page is to provide an area where we can store whatever | | 91 | * The goal of the UTLS page is to provide an area where we can store whatever |
92 | * we want, in a way that it is accessible both when the Kernel and when the | | 92 | * we want, in a way that it is accessible both when the Kernel and when the |
93 | * User page tables are loaded in CR3. | | 93 | * User page tables are loaded in CR3. |
94 | * | | 94 | * |
95 | * We store in the UTLS page three 64bit values: | | 95 | * We store in the UTLS page three 64bit values: |
96 | * | | 96 | * |
97 | * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel | | 97 | * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel |
98 | * page tables. | | 98 | * page tables. |
99 | * | | 99 | * |
100 | * o UTLS_SCRATCH: a dummy place where we temporarily store a value during | | 100 | * o UTLS_SCRATCH: a dummy place where we temporarily store a value during |
101 | * the syscall entry procedure. | | 101 | * the syscall entry procedure. |
102 | * | | 102 | * |
103 | * o UTLS_RSP0: the value we must put in RSP in order to have a stack where | | 103 | * o UTLS_RSP0: the value we must put in RSP in order to have a stack where |
104 | * we can push the register states. This is used only during the syscall | | 104 | * we can push the register states. This is used only during the syscall |
105 | * entry procedure, because there the CPU does not automatically switch | | 105 | * entry procedure, because there the CPU does not automatically switch |
106 | * RSP (it does not use the TSS.rsp0 mechanism described below). | | 106 | * RSP (it does not use the TSS.rsp0 mechanism described below). |
107 | * | | 107 | * |
108 | * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~ | | 108 | * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~ |
109 | * | | 109 | * |
110 | * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between | | 110 | * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between |
111 | * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to | | 111 | * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to |
112 | * the stack of the new LWP. Then the execution continues. At some point, the | | 112 | * the stack of the new LWP. Then the execution continues. At some point, the |
113 | * user LWP we context-switched to will perform a syscall or will receive an | | 113 | * user LWP we context-switched to will perform a syscall or will receive an |
114 | * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a | | 114 | * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a |
115 | * stack. The kernel then pushes the register states on this stack, and | | 115 | * stack. The kernel then pushes the register states on this stack, and |
116 | * executes in kernel mode normally. | | 116 | * executes in kernel mode normally. |
117 | * | | 117 | * |
118 | * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore, | | 118 | * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore, |
119 | * when an interrupt is received while we were in kernel mode, the CPU does not | | 119 | * when an interrupt is received while we were in kernel mode, the CPU does not |
120 | * read TSS.rsp0. Instead, it just uses the current stack. | | 120 | * read TSS.rsp0. Instead, it just uses the current stack. |
121 | * | | 121 | * |
122 | * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~ | | 122 | * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~ |
123 | * | | 123 | * |
124 | * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU | | 124 | * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU |
125 | * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do | | 125 | * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do |
126 | * _not_ have associated physical addresses. They are only two VAs. | | 126 | * _not_ have associated physical addresses. They are only two VAs. |
127 | * | | 127 | * |
128 | * The first page is unmapped and acts as a redzone. The second page is | | 128 | * The first page is unmapped and acts as a redzone. The second page is |
129 | * dynamically kentered into the highest page of the real per-lwp kernel stack; | | 129 | * dynamically kentered into the highest page of the real per-lwp kernel stack; |
130 | * but pay close attention, it is kentered _only_ in the user page tables. | | 130 | * but pay close attention, it is kentered _only_ in the user page tables. |
131 | * That is to say, the VA of this second page is mapped when the user page | | 131 | * That is to say, the VA of this second page is mapped when the user page |
132 | * tables are loaded, but not mapped when the kernel page tables are loaded. | | 132 | * tables are loaded, but not mapped when the kernel page tables are loaded. |
133 | * | | 133 | * |
134 | * During a context switch, svs_lwp_switch() gets called first. This function | | 134 | * During a context switch, svs_lwp_switch() gets called first. This function |
135 | * does the kenter job described above, not in the kernel page tables (that | | 135 | * does the kenter job described above, not in the kernel page tables (that |
136 | * are currently loaded), but in the user page tables (that are not loaded). | | 136 | * are currently loaded), but in the user page tables (that are not loaded). |
137 | * | | 137 | * |
138 | * VIRTUAL ADDRESSES PHYSICAL ADDRESSES | | 138 | * VIRTUAL ADDRESSES PHYSICAL ADDRESSES |
139 | * | | 139 | * |
140 | * +-----------------------------+ | | 140 | * +-----------------------------+ |
141 | * | KERNEL PAGE TABLES | | | 141 | * | KERNEL PAGE TABLES | |
142 | * | +-------------------+ | +-------------------+ | | 142 | * | +-------------------+ | +-------------------+ |
143 | * | | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) | | | 143 | * | | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) | |
144 | * | +-------------------+ | +-------------------+ | | 144 | * | +-------------------+ | +-------------------+ |
145 | * | | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) | | | 145 | * | | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) | |
146 | * | +-------------------+ | +-------------------+ | | 146 | * | +-------------------+ | +-------------------+ |
147 | * | | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) | | | 147 | * | | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) | |
148 | * | +-------------------+ | +-------------------+ | | 148 | * | +-------------------+ | +-------------------+ |
149 | * | | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) | | | 149 | * | | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) | |
150 | * | +-------------------+ | +-> +-------------------+ | | 150 | * | +-------------------+ | +-> +-------------------+ |
151 | * +-----------------------------+ | | | 151 | * +-----------------------------+ | |
152 | * | | | 152 | * | |
153 | * +---------------------------------------+ | | | 153 | * +---------------------------------------+ | |
154 | * | USER PAGE TABLES | | | | 154 | * | USER PAGE TABLES | | |
155 | * | +----------------------------------+ | | | | 155 | * | +----------------------------------+ | | |
156 | * | | pcpuarea->ent[cid].rsp0 (page 0) | | | | | 156 | * | | pcpuarea->ent[cid].rsp0 (page 0) | | | |
157 | * | +----------------------------------+ | | | | 157 | * | +----------------------------------+ | | |
158 | * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+ | | 158 | * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+ |
159 | * | +----------------------------------+ | | | 159 | * | +----------------------------------+ | |
160 | * +---------------------------------------+ | | 160 | * +---------------------------------------+ |
161 | * | | 161 | * |
162 | * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1) | | 162 | * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1) |
163 | * in TSS.rsp0. Later, when returning to userland on the lwp we context- | | 163 | * in TSS.rsp0. Later, when returning to userland on the lwp we context- |
164 | * switched to, we will load the user page tables and execute in userland | | 164 | * switched to, we will load the user page tables and execute in userland |
165 | * normally. | | 165 | * normally. |
166 | * | | 166 | * |
167 | * Next time an interrupt or syscall is received, the CPU will automatically | | 167 | * Next time an interrupt or syscall is received, the CPU will automatically |
168 | * use TSS.rsp0 as a stack. Here it is executing with the user page tables | | 168 | * use TSS.rsp0 as a stack. Here it is executing with the user page tables |
169 | * loaded, and therefore TSS.rsp0 is _mapped_. | | 169 | * loaded, and therefore TSS.rsp0 is _mapped_. |
170 | * | | 170 | * |
171 | * As part of the kernel entry procedure, we now switch CR3 to load the kernel | | 171 | * As part of the kernel entry procedure, we now switch CR3 to load the kernel |
172 | * page tables. Here, we are still using the stack pointer we set in TSS.rsp0. | | 172 | * page tables. Here, we are still using the stack pointer we set in TSS.rsp0. |
173 | * | | 173 | * |
174 | * Remember that it was only one page of stack which was mapped only in the | | 174 | * Remember that it was only one page of stack which was mapped only in the |
175 | * user page tables. We just switched to the kernel page tables, so we must | | 175 | * user page tables. We just switched to the kernel page tables, so we must |
176 | * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so, | | 176 | * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so, |
177 | * without touching the stack (since it is now unmapped, touching it would | | 177 | * without touching the stack (since it is now unmapped, touching it would |
178 | * fault). | | 178 | * fault). |
179 | * | | 179 | * |
180 | * After we updated RSP, we can continue execution exactly as in the non-SVS | | 180 | * After we updated RSP, we can continue execution exactly as in the non-SVS |
181 | * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if | | 181 | * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if |
182 | * we updated RSP to a totally different VA, this VA points to the same | | 182 | * we updated RSP to a totally different VA, this VA points to the same |
183 | * physical page as TSS.rsp0. So in the end, the values the CPU pushed are | | 183 | * physical page as TSS.rsp0. So in the end, the values the CPU pushed are |
184 | * still here even with the new RSP. | | 184 | * still here even with the new RSP. |
185 | * | | 185 | * |
186 | * Thanks to this double-kenter optimization, we don't need to copy the | | 186 | * Thanks to this double-kenter optimization, we don't need to copy the |
187 | * trapframe during each user<->kernel transition. | | 187 | * trapframe during each user<->kernel transition. |
188 | * | | 188 | * |
189 | * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | 189 | * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
190 | * | | 190 | * |
191 | * o Touching ci_svs_updir without holding ci_svs_mtx first is *not* | | 191 | * o Touching ci_svs_updir without holding ci_svs_mtx first is *not* |
192 | * allowed. | | 192 | * allowed. |
193 | * | | 193 | * |
194 | * o pm_kernel_cpus contains the set of CPUs that have the pmap loaded | | 194 | * o pm_kernel_cpus contains the set of CPUs that have the pmap loaded |
195 | * in their CR3 register. It must *not* be replaced by pm_cpus. | | 195 | * in their CR3 register. It must *not* be replaced by pm_cpus. |
196 | * | | 196 | * |
197 | * o When a context switch on the current CPU is made from a user LWP | | 197 | * o When a context switch on the current CPU is made from a user LWP |
198 | * towards a kernel LWP, CR3 is not updated. Therefore, the pmap's | | 198 | * towards a kernel LWP, CR3 is not updated. Therefore, the pmap's |
199 | * pm_kernel_cpus still contains the current CPU. It implies that the | | 199 | * pm_kernel_cpus still contains the current CPU. It implies that the |
200 | * remote CPUs that execute other threads of the user process we just | | 200 | * remote CPUs that execute other threads of the user process we just |
201 | * left will keep synchronizing us against their changes. | | 201 | * left will keep synchronizing us against their changes. |
202 | * | | 202 | * |
203 | * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~ | | 203 | * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~ |
204 | * | | 204 | * |
205 | * o PTE Space | | 205 | * o PTE Space |
206 | * o Direct Map | | 206 | * o Direct Map |
207 | * o Remote PCPU Areas | | 207 | * o Remote PCPU Areas |
208 | * o Kernel Heap | | 208 | * o Kernel Heap |
209 | * o Kernel Image | | 209 | * o Kernel Image |
210 | * | | 210 | * |
211 | * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | 211 | * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
212 | * | | 212 | * |
213 | * Ordered from highest priority to lowest: | | 213 | * Ordered from highest priority to lowest: |
214 | * | | 214 | * |
215 | * o The NMI stack is not double-entered. Therefore if we ever receive an NMI | | 215 | * o The NMI stack is not double-entered. Therefore if we ever receive an NMI |
216 | * and leave it, the content of the stack will be visible to userland (via | | 216 | * and leave it, the content of the stack will be visible to userland (via |
217 | * Meltdown). Normally we never leave NMIs, unless a privileged user | | 217 | * Meltdown). Normally we never leave NMIs, unless a privileged user |
218 | * launched PMCs. That's unlikely to happen, our PMC support is pretty | | 218 | * launched PMCs. That's unlikely to happen, our PMC support is pretty |
219 | * minimal, and privileged only. | | 219 | * minimal, and privileged only. |
220 | * | | 220 | * |
221 | * o Narrow down the entry points: hide the 'jmp handler' instructions. This | | 221 | * o Narrow down the entry points: hide the 'jmp handler' instructions. This |
222 | * makes sense on GENERIC_KASLR kernels. | | 222 | * makes sense on GENERIC_KASLR kernels. |
223 | * | | 223 | * |
224 | * o Right now there is only one global LDT, and that's not compatible with | | 224 | * o Right now there is only one global LDT, and that's not compatible with |
225 | * USER_LDT. | | 225 | * USER_LDT. |
226 | */ | | 226 | */ |
227 | | | 227 | |
228 | bool svs_enabled __read_mostly = false; | | 228 | bool svs_enabled __read_mostly = false; |
229 | | | 229 | |
230 | struct svs_utls { | | 230 | struct svs_utls { |
231 | paddr_t kpdirpa; | | 231 | paddr_t kpdirpa; |
232 | uint64_t scratch; | | 232 | uint64_t scratch; |
233 | vaddr_t rsp0; | | 233 | vaddr_t rsp0; |
234 | }; | | 234 | }; |
235 | | | 235 | |
236 | static pd_entry_t * | | 236 | static pd_entry_t * |
237 | svs_tree_add(struct cpu_info *ci, vaddr_t va) | | 237 | svs_tree_add(struct cpu_info *ci, vaddr_t va) |
238 | { | | 238 | { |
239 | extern const vaddr_t ptp_masks[]; | | 239 | extern const vaddr_t ptp_masks[]; |
240 | extern const int ptp_shifts[]; | | 240 | extern const int ptp_shifts[]; |
241 | extern const long nbpd[]; | | 241 | extern const long nbpd[]; |
242 | pd_entry_t *dstpde; | | 242 | pd_entry_t *dstpde; |
243 | size_t i, pidx, mod; | | 243 | size_t i, pidx, mod; |
244 | struct vm_page *pg; | | 244 | struct vm_page *pg; |
245 | paddr_t pa; | | 245 | paddr_t pa; |
246 | | | 246 | |
247 | dstpde = ci->ci_svs_updir; | | 247 | dstpde = ci->ci_svs_updir; |
248 | mod = (size_t)-1; | | 248 | mod = (size_t)-1; |
249 | | | 249 | |
250 | for (i = PTP_LEVELS; i > 1; i--) { | | 250 | for (i = PTP_LEVELS; i > 1; i--) { |
251 | pidx = pl_i(va % mod, i); | | 251 | pidx = pl_i(va % mod, i); |
252 | | | 252 | |
253 | if (!pmap_valid_entry(dstpde[pidx])) { | | 253 | if (!pmap_valid_entry(dstpde[pidx])) { |
254 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); | | 254 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); |
255 | if (pg == 0) | | 255 | if (pg == 0) |
256 | panic("%s: failed to allocate PA for CPU %d\n", | | 256 | panic("%s: failed to allocate PA for CPU %d\n", |
257 | __func__, cpu_index(ci)); | | 257 | __func__, cpu_index(ci)); |
258 | pa = VM_PAGE_TO_PHYS(pg); | | 258 | pa = VM_PAGE_TO_PHYS(pg); |
259 | | | 259 | |
260 | dstpde[pidx] = PG_V | PG_RW | pa; | | 260 | dstpde[pidx] = PG_V | PG_RW | pa; |
261 | } | | 261 | } |
262 | | | 262 | |
263 | pa = (paddr_t)(dstpde[pidx] & PG_FRAME); | | 263 | pa = (paddr_t)(dstpde[pidx] & PG_FRAME); |
264 | dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa); | | 264 | dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa); |
265 | mod = nbpd[i-1]; | | 265 | mod = nbpd[i-1]; |
266 | } | | 266 | } |
267 | | | 267 | |
268 | return dstpde; | | 268 | return dstpde; |
269 | } | | 269 | } |
270 | | | 270 | |
271 | static void | | 271 | static void |
272 | svs_page_add(struct cpu_info *ci, vaddr_t va) | | 272 | svs_page_add(struct cpu_info *ci, vaddr_t va) |
273 | { | | 273 | { |
274 | pd_entry_t *srcpde, *dstpde, pde; | | 274 | pd_entry_t *srcpde, *dstpde, pde; |
275 | size_t idx, pidx; | | 275 | size_t idx, pidx; |
276 | paddr_t pa; | | 276 | paddr_t pa; |
277 | | | 277 | |
278 | /* Create levels L4, L3 and L2. */ | | 278 | /* Create levels L4, L3 and L2. */ |
279 | dstpde = svs_tree_add(ci, va); | | 279 | dstpde = svs_tree_add(ci, va); |
280 | | | 280 | |
281 | pidx = pl1_i(va % NBPD_L2); | | 281 | pidx = pl1_i(va % NBPD_L2); |
282 | | | 282 | |
283 | /* | | 283 | /* |
284 | * If 'va' is in a large page, we need to compute its physical | | 284 | * If 'va' is in a large page, we need to compute its physical |
285 | * address manually. | | 285 | * address manually. |
286 | */ | | 286 | */ |
287 | idx = pl2_i(va); | | 287 | idx = pl2_i(va); |
288 | srcpde = L2_BASE; | | 288 | srcpde = L2_BASE; |
289 | if (!pmap_valid_entry(srcpde[idx])) { | | 289 | if (!pmap_valid_entry(srcpde[idx])) { |
290 | panic("%s: L2 page not mapped", __func__); | | 290 | panic("%s: L2 page not mapped", __func__); |
291 | } | | 291 | } |
292 | if (srcpde[idx] & PG_PS) { | | 292 | if (srcpde[idx] & PG_PS) { |
293 | pa = srcpde[idx] & PG_2MFRAME; | | 293 | pa = srcpde[idx] & PG_2MFRAME; |
294 | pa += (paddr_t)(va % NBPD_L2); | | 294 | pa += (paddr_t)(va % NBPD_L2); |
295 | pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa; | | 295 | pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa; |
296 | | | 296 | |
297 | if (pmap_valid_entry(dstpde[pidx])) { | | 297 | if (pmap_valid_entry(dstpde[pidx])) { |
298 | panic("%s: L1 page already mapped", __func__); | | 298 | panic("%s: L1 page already mapped", __func__); |
299 | } | | 299 | } |
300 | dstpde[pidx] = pde; | | 300 | dstpde[pidx] = pde; |
301 | return; | | 301 | return; |
302 | } | | 302 | } |
303 | | | 303 | |
304 | /* | | 304 | /* |
305 | * Normal page, just copy the PDE. | | 305 | * Normal page, just copy the PDE. |
306 | */ | | 306 | */ |
307 | idx = pl1_i(va); | | 307 | idx = pl1_i(va); |
308 | srcpde = L1_BASE; | | 308 | srcpde = L1_BASE; |
309 | if (!pmap_valid_entry(srcpde[idx])) { | | 309 | if (!pmap_valid_entry(srcpde[idx])) { |
310 | panic("%s: L1 page not mapped", __func__); | | 310 | panic("%s: L1 page not mapped", __func__); |
311 | } | | 311 | } |
312 | if (pmap_valid_entry(dstpde[pidx])) { | | 312 | if (pmap_valid_entry(dstpde[pidx])) { |
313 | panic("%s: L1 page already mapped", __func__); | | 313 | panic("%s: L1 page already mapped", __func__); |
314 | } | | 314 | } |
315 | dstpde[pidx] = srcpde[idx] & ~(PG_G); | | 315 | dstpde[pidx] = srcpde[idx] & ~(PG_G); |
316 | } | | 316 | } |
317 | | | 317 | |
318 | static void | | 318 | static void |
319 | svs_rsp0_init(struct cpu_info *ci) | | 319 | svs_rsp0_init(struct cpu_info *ci) |
320 | { | | 320 | { |
321 | const cpuid_t cid = cpu_index(ci); | | 321 | const cpuid_t cid = cpu_index(ci); |
322 | vaddr_t va, rsp0; | | 322 | vaddr_t va, rsp0; |
323 | pd_entry_t *pd; | | 323 | pd_entry_t *pd; |
324 | size_t pidx; | | 324 | size_t pidx; |
325 | | | 325 | |
326 | rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0; | | 326 | rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0; |
327 | | | 327 | |
328 | /* The first page is a redzone. */ | | 328 | /* The first page is a redzone. */ |
329 | va = rsp0 + PAGE_SIZE; | | 329 | va = rsp0 + PAGE_SIZE; |
330 | | | 330 | |
331 | /* Create levels L4, L3 and L2. */ | | 331 | /* Create levels L4, L3 and L2. */ |
332 | pd = svs_tree_add(ci, va); | | 332 | pd = svs_tree_add(ci, va); |
333 | | | 333 | |
334 | /* Get the info for L1. */ | | 334 | /* Get the info for L1. */ |
335 | pidx = pl1_i(va % NBPD_L2); | | 335 | pidx = pl1_i(va % NBPD_L2); |
336 | if (pmap_valid_entry(pd[pidx])) { | | 336 | if (pmap_valid_entry(pd[pidx])) { |
337 | panic("%s: rsp0 page already mapped", __func__); | | 337 | panic("%s: rsp0 page already mapped", __func__); |
338 | } | | 338 | } |
339 | | | 339 | |
340 | ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx]; | | 340 | ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx]; |
341 | ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe); | | 341 | ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe); |
342 | ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe); | | 342 | ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe); |
343 | ci->ci_svs_krsp0 = 0; | | 343 | ci->ci_svs_krsp0 = 0; |
344 | } | | 344 | } |
345 | | | 345 | |
346 | static void | | 346 | static void |
347 | svs_utls_init(struct cpu_info *ci) | | 347 | svs_utls_init(struct cpu_info *ci) |
348 | { | | 348 | { |
349 | const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls; | | 349 | const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls; |
350 | struct svs_utls *utls; | | 350 | struct svs_utls *utls; |
351 | struct vm_page *pg; | | 351 | struct vm_page *pg; |
352 | pd_entry_t *pd; | | 352 | pd_entry_t *pd; |
353 | size_t pidx; | | 353 | size_t pidx; |
354 | paddr_t pa; | | 354 | paddr_t pa; |
355 | vaddr_t va; | | 355 | vaddr_t va; |
356 | | | 356 | |
357 | /* Create levels L4, L3 and L2 of the UTLS page. */ | | 357 | /* Create levels L4, L3 and L2 of the UTLS page. */ |
358 | pd = svs_tree_add(ci, utlsva); | | 358 | pd = svs_tree_add(ci, utlsva); |
359 | | | 359 | |
360 | /* Allocate L1. */ | | 360 | /* Allocate L1. */ |
361 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); | | 361 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); |
362 | if (pg == 0) | | 362 | if (pg == 0) |
363 | panic("%s: failed to allocate PA for CPU %d\n", __func__, | | 363 | panic("%s: failed to allocate PA for CPU %d\n", __func__, |
364 | cpu_index(ci)); | | 364 | cpu_index(ci)); |
365 | pa = VM_PAGE_TO_PHYS(pg); | | 365 | pa = VM_PAGE_TO_PHYS(pg); |
366 | | | 366 | |
367 | /* Enter L1. */ | | 367 | /* Enter L1. */ |
368 | if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) { | | 368 | if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) { |
369 | panic("%s: local page already mapped", __func__); | | 369 | panic("%s: local page already mapped", __func__); |
370 | } | | 370 | } |
371 | pidx = pl1_i(utlsva % NBPD_L2); | | 371 | pidx = pl1_i(utlsva % NBPD_L2); |
372 | if (pmap_valid_entry(pd[pidx])) { | | 372 | if (pmap_valid_entry(pd[pidx])) { |
373 | panic("%s: L1 page already mapped", __func__); | | 373 | panic("%s: L1 page already mapped", __func__); |
374 | } | | 374 | } |
375 | pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa; | | 375 | pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa; |
376 | | | 376 | |
377 | /* | | 377 | /* |
378 | * Now, allocate a VA in the kernel map, that points to the UTLS | | 378 | * Now, allocate a VA in the kernel map, that points to the UTLS |
379 | * page. After that, the UTLS page will be accessible in kernel | | 379 | * page. After that, the UTLS page will be accessible in kernel |
380 | * mode via ci_svs_utls. | | 380 | * mode via ci_svs_utls. |
381 | */ | | 381 | */ |
382 | va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, | | 382 | va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, |
383 | UVM_KMF_VAONLY|UVM_KMF_NOWAIT); | | 383 | UVM_KMF_VAONLY|UVM_KMF_NOWAIT); |
384 | if (va == 0) { | | 384 | if (va == 0) { |
385 | panic("%s: unable to allocate VA\n", __func__); | | 385 | panic("%s: unable to allocate VA\n", __func__); |
386 | } | | 386 | } |
387 | pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0); | | 387 | pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0); |
388 | pmap_update(pmap_kernel()); | | 388 | pmap_update(pmap_kernel()); |
389 | | | 389 | |
390 | ci->ci_svs_utls = va; | | 390 | ci->ci_svs_utls = va; |
391 | | | 391 | |
392 | /* Initialize the constant fields of the UTLS page */ | | 392 | /* Initialize the constant fields of the UTLS page */ |
393 | utls = (struct svs_utls *)ci->ci_svs_utls; | | 393 | utls = (struct svs_utls *)ci->ci_svs_utls; |
394 | utls->rsp0 = ci->ci_svs_rsp0; | | 394 | utls->rsp0 = ci->ci_svs_rsp0; |
395 | } | | 395 | } |
396 | | | 396 | |
397 | static void | | 397 | static void |
398 | svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size) | | 398 | svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size) |
399 | { | | 399 | { |
400 | size_t i, n; | | 400 | size_t i, n; |
401 | | | 401 | |
402 | KASSERT(size % PAGE_SIZE == 0); | | 402 | KASSERT(size % PAGE_SIZE == 0); |
403 | n = size / PAGE_SIZE; | | 403 | n = size / PAGE_SIZE; |
404 | for (i = 0; i < n; i++) { | | 404 | for (i = 0; i < n; i++) { |
405 | svs_page_add(ci, va + i * PAGE_SIZE); | | 405 | svs_page_add(ci, va + i * PAGE_SIZE); |
406 | } | | 406 | } |
407 | } | | 407 | } |
408 | | | 408 | |
409 | void | | 409 | void |
410 | cpu_svs_init(struct cpu_info *ci) | | 410 | cpu_svs_init(struct cpu_info *ci) |
411 | { | | 411 | { |
412 | extern char __text_user_start; | | 412 | extern char __text_user_start; |
413 | extern char __text_user_end; | | 413 | extern char __text_user_end; |
414 | const cpuid_t cid = cpu_index(ci); | | 414 | const cpuid_t cid = cpu_index(ci); |
415 | struct vm_page *pg; | | 415 | struct vm_page *pg; |
416 | | | 416 | |
417 | KASSERT(ci != NULL); | | 417 | KASSERT(ci != NULL); |
418 | | | 418 | |
419 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); | | 419 | pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); |
420 | if (pg == 0) | | 420 | if (pg == 0) |
421 | panic("%s: failed to allocate L4 PA for CPU %d\n", | | 421 | panic("%s: failed to allocate L4 PA for CPU %d\n", |
422 | __func__, cpu_index(ci)); | | 422 | __func__, cpu_index(ci)); |
423 | ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg); | | 423 | ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg); |
424 | | | 424 | |
425 | ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, | | 425 | ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, |
426 | UVM_KMF_VAONLY | UVM_KMF_NOWAIT); | | 426 | UVM_KMF_VAONLY | UVM_KMF_NOWAIT); |
427 | if (ci->ci_svs_updir == NULL) | | 427 | if (ci->ci_svs_updir == NULL) |
428 | panic("%s: failed to allocate L4 VA for CPU %d\n", | | 428 | panic("%s: failed to allocate L4 VA for CPU %d\n", |
429 | __func__, cpu_index(ci)); | | 429 | __func__, cpu_index(ci)); |
430 | | | 430 | |
431 | pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa, | | 431 | pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa, |
432 | VM_PROT_READ | VM_PROT_WRITE, 0); | | 432 | VM_PROT_READ | VM_PROT_WRITE, 0); |
433 | | | 433 | |
434 | pmap_update(pmap_kernel()); | | 434 | pmap_update(pmap_kernel()); |
435 | | | 435 | |
436 | ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0); | | 436 | ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0); |
437 | | | 437 | |
438 | mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM); | | 438 | mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM); |
439 | | | 439 | |
440 | svs_page_add(ci, (vaddr_t)&pcpuarea->idt); | | 440 | svs_page_add(ci, (vaddr_t)&pcpuarea->idt); |
441 | svs_page_add(ci, (vaddr_t)&pcpuarea->ldt); | | 441 | svs_page_add(ci, (vaddr_t)&pcpuarea->ldt); |
442 | svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid], | | 442 | svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid], |
443 | offsetof(struct pcpu_entry, rsp0)); | | 443 | offsetof(struct pcpu_entry, rsp0)); |
444 | svs_range_add(ci, (vaddr_t)&__text_user_start, | | 444 | svs_range_add(ci, (vaddr_t)&__text_user_start, |
445 | (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start); | | 445 | (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start); |
446 | | | 446 | |
447 | svs_rsp0_init(ci); | | 447 | svs_rsp0_init(ci); |
448 | svs_utls_init(ci); | | 448 | svs_utls_init(ci); |
449 | } | | 449 | } |
450 | | | 450 | |
451 | void | | 451 | void |
452 | svs_pmap_sync(struct pmap *pmap, int index) | | 452 | svs_pmap_sync(struct pmap *pmap, int index) |
453 | { | | 453 | { |
454 | CPU_INFO_ITERATOR cii; | | 454 | CPU_INFO_ITERATOR cii; |
455 | struct cpu_info *ci; | | 455 | struct cpu_info *ci; |
456 | cpuid_t cid; | | 456 | cpuid_t cid; |
457 | | | 457 | |
458 | KASSERT(pmap != NULL); | | 458 | KASSERT(pmap != NULL); |
459 | KASSERT(pmap != pmap_kernel()); | | 459 | KASSERT(pmap != pmap_kernel()); |
460 | KASSERT(mutex_owned(pmap->pm_lock)); | | 460 | KASSERT(mutex_owned(pmap->pm_lock)); |
461 | KASSERT(kpreempt_disabled()); | | 461 | KASSERT(kpreempt_disabled()); |
462 | KASSERT(index < 255); | | 462 | KASSERT(index < 255); |
463 | | | 463 | |
464 | for (CPU_INFO_FOREACH(cii, ci)) { | | 464 | for (CPU_INFO_FOREACH(cii, ci)) { |
465 | cid = cpu_index(ci); | | 465 | cid = cpu_index(ci); |
466 | | | 466 | |
467 | if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) { | | 467 | if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) { |
468 | continue; | | 468 | continue; |
469 | } | | 469 | } |
470 | | | 470 | |
471 | /* take the lock and check again */ | | 471 | /* take the lock and check again */ |
472 | mutex_enter(&ci->ci_svs_mtx); | | 472 | mutex_enter(&ci->ci_svs_mtx); |
473 | if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) { | | 473 | if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) { |
474 | ci->ci_svs_updir[index] = pmap->pm_pdir[index]; | | 474 | ci->ci_svs_updir[index] = pmap->pm_pdir[index]; |
475 | } | | 475 | } |
476 | mutex_exit(&ci->ci_svs_mtx); | | 476 | mutex_exit(&ci->ci_svs_mtx); |
477 | } | | 477 | } |
478 | } | | 478 | } |
479 | | | 479 | |
/*
 * Called on context switch: install the incoming user LWP's kernel
 * stack page into the per-CPU user page tables, and record the
 * matching rsp0 values in cpu_info and the UTLS page. Kernel threads
 * are skipped since they never run on the user page tables.
 */
void
svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
	struct cpu_info *ci = curcpu();
	struct svs_utls *utls;
	struct pcb *pcb;
	pt_entry_t *pte;
	uintptr_t rsp0;
	vaddr_t va;

	/* Nothing to do for kernel threads. */
	if (newlwp->l_flag & LW_SYSTEM) {
		return;
	}

#ifdef DIAGNOSTIC
	/*
	 * Check that the outgoing user LWP's state still matches what
	 * was installed at the previous switch.
	 */
	if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) {
		pcb = lwp_getpcb(oldlwp);
		rsp0 = pcb->pcb_rsp0;
		va = rounddown(rsp0, PAGE_SIZE);
		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
		pte = ci->ci_svs_rsp0_pte;
		KASSERT(*pte == L1_BASE[pl1_i(va)]);
	}
#endif

	pcb = lwp_getpcb(newlwp);
	rsp0 = pcb->pcb_rsp0;
	va = rounddown(rsp0, PAGE_SIZE);

	/* Update the kernel rsp0 in cpu_info */
	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
	/* Kernel and user rsp0 must share the same in-page offset. */
	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
	    (ci->ci_svs_ursp0 % PAGE_SIZE));

	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->scratch = 0;

	/*
	 * Enter the user rsp0. We don't need to flush the TLB here, since
	 * the user page tables are not loaded.
	 */
	pte = ci->ci_svs_rsp0_pte;
	*pte = L1_BASE[pl1_i(va)];
}
524 | | | 524 | |
/*
 * Atomically read one 64-bit slot of the pmap's page directory.
 * There is no plain 64-bit atomic load primitive available here, so
 * abuse atomic_cas_64 with an arbitrary compare value (666): the CAS
 * always returns the current contents, and a store happens only if
 * the slot already holds 666 — in which case the same value is
 * written back, so the slot is never actually modified.
 */
static inline pt_entry_t
svs_pte_atomic_read(struct pmap *pmap, size_t idx)
{
	/*
	 * XXX: We don't have a basic atomic_fetch_64 function?
	 */
	return atomic_cas_64(&pmap->pm_pdir[idx], 666, 666);
}
533 | | | 533 | |
534 | /* | | 534 | /* |
535 | * We may come here with the pmap unlocked. So read its PTEs atomically. If | | 535 | * We may come here with the pmap unlocked. So read its PTEs atomically. If |
536 | * a remote CPU is updating them at the same time, it's not a problem: the | | 536 | * a remote CPU is updating them at the same time, it's not a problem: the |
537 | * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be | | 537 | * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be |
538 | * synchronized properly. | | 538 | * synchronized properly. |
539 | */ | | 539 | */ |
540 | void | | 540 | void |
541 | svs_pdir_switch(struct pmap *pmap) | | 541 | svs_pdir_switch(struct pmap *pmap) |
542 | { | | 542 | { |
543 | struct cpu_info *ci = curcpu(); | | 543 | struct cpu_info *ci = curcpu(); |
544 | struct svs_utls *utls; | | 544 | struct svs_utls *utls; |
545 | pt_entry_t pte; | | 545 | pt_entry_t pte; |
546 | size_t i; | | 546 | size_t i; |
547 | | | 547 | |
548 | KASSERT(kpreempt_disabled()); | | 548 | KASSERT(kpreempt_disabled()); |
549 | KASSERT(pmap != pmap_kernel()); | | 549 | KASSERT(pmap != pmap_kernel()); |
550 | | | 550 | |
551 | ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0); | | 551 | ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0); |
552 | | | 552 | |
553 | /* Update the info in the UTLS page */ | | 553 | /* Update the info in the UTLS page */ |
554 | utls = (struct svs_utls *)ci->ci_svs_utls; | | 554 | utls = (struct svs_utls *)ci->ci_svs_utls; |
555 | utls->kpdirpa = ci->ci_svs_kpdirpa; | | 555 | utls->kpdirpa = ci->ci_svs_kpdirpa; |
556 | | | 556 | |
557 | mutex_enter(&ci->ci_svs_mtx); | | 557 | mutex_enter(&ci->ci_svs_mtx); |
558 | | | 558 | |
559 | /* User slots. */ | | 559 | /* User slots. */ |
560 | for (i = 0; i < 255; i++) { | | 560 | for (i = 0; i < 255; i++) { |
561 | pte = svs_pte_atomic_read(pmap, i); | | 561 | pte = svs_pte_atomic_read(pmap, i); |
562 | ci->ci_svs_updir[i] = pte; | | 562 | ci->ci_svs_updir[i] = pte; |
563 | } | | 563 | } |
564 | | | 564 | |
565 | mutex_exit(&ci->ci_svs_mtx); | | 565 | mutex_exit(&ci->ci_svs_mtx); |
566 | } | | 566 | } |
567 | | | 567 | |
568 | static void | | 568 | static void |
569 | svs_enable(void) | | 569 | svs_enable(void) |
570 | { | | 570 | { |
571 | extern uint8_t svs_enter, svs_enter_end; | | 571 | extern uint8_t svs_enter, svs_enter_end; |
572 | extern uint8_t svs_enter_altstack, svs_enter_altstack_end; | | 572 | extern uint8_t svs_enter_altstack, svs_enter_altstack_end; |
573 | extern uint8_t svs_leave, svs_leave_end; | | 573 | extern uint8_t svs_leave, svs_leave_end; |
574 | extern uint8_t svs_leave_altstack, svs_leave_altstack_end; | | 574 | extern uint8_t svs_leave_altstack, svs_leave_altstack_end; |
575 | u_long psl, cr0; | | 575 | u_long psl, cr0; |
576 | uint8_t *bytes; | | 576 | uint8_t *bytes; |
577 | size_t size; | | 577 | size_t size; |
578 | | | 578 | |
579 | svs_enabled = true; | | 579 | svs_enabled = true; |
580 | | | 580 | |
581 | x86_patch_window_open(&psl, &cr0); | | 581 | x86_patch_window_open(&psl, &cr0); |
582 | | | 582 | |
583 | bytes = &svs_enter; | | 583 | bytes = &svs_enter; |
584 | size = (size_t)&svs_enter_end - (size_t)&svs_enter; | | 584 | size = (size_t)&svs_enter_end - (size_t)&svs_enter; |
585 | x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); | | 585 | x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); |
586 | | | 586 | |
587 | bytes = &svs_enter_altstack; | | 587 | bytes = &svs_enter_altstack; |
588 | size = (size_t)&svs_enter_altstack_end - | | 588 | size = (size_t)&svs_enter_altstack_end - |
589 | (size_t)&svs_enter_altstack; | | 589 | (size_t)&svs_enter_altstack; |
590 | x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); | | 590 | x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); |
591 | | | 591 | |
592 | bytes = &svs_leave; | | 592 | bytes = &svs_leave; |
593 | size = (size_t)&svs_leave_end - (size_t)&svs_leave; | | 593 | size = (size_t)&svs_leave_end - (size_t)&svs_leave; |
594 | x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); | | 594 | x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); |
595 | | | 595 | |
596 | bytes = &svs_leave_altstack; | | 596 | bytes = &svs_leave_altstack; |
597 | size = (size_t)&svs_leave_altstack_end - | | 597 | size = (size_t)&svs_leave_altstack_end - |
598 | (size_t)&svs_leave_altstack; | | 598 | (size_t)&svs_leave_altstack; |
599 | x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); | | 599 | x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); |
600 | | | 600 | |
601 | x86_patch_window_close(psl, cr0); | | 601 | x86_patch_window_close(psl, cr0); |
602 | } | | 602 | } |
603 | | | 603 | |
604 | static void | | 604 | static void |
605 | svs_disable_hotpatch(void) | | 605 | svs_disable_hotpatch(void) |
606 | { | | 606 | { |
607 | extern uint8_t nosvs_enter, nosvs_enter_end; | | 607 | extern uint8_t nosvs_enter, nosvs_enter_end; |
608 | extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end; | | 608 | extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end; |
609 | extern uint8_t nosvs_leave, nosvs_leave_end; | | 609 | extern uint8_t nosvs_leave, nosvs_leave_end; |
610 | extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end; | | 610 | extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end; |
611 | u_long psl, cr0; | | 611 | u_long psl, cr0; |
612 | uint8_t *bytes; | | 612 | uint8_t *bytes; |
613 | size_t size; | | 613 | size_t size; |
614 | | | 614 | |
615 | x86_patch_window_open(&psl, &cr0); | | 615 | x86_patch_window_open(&psl, &cr0); |
616 | | | 616 | |
617 | bytes = &nosvs_enter; | | 617 | bytes = &nosvs_enter; |
618 | size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter; | | 618 | size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter; |
619 | x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); | | 619 | x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size); |
620 | | | 620 | |
621 | bytes = &nosvs_enter_altstack; | | 621 | bytes = &nosvs_enter_altstack; |
622 | size = (size_t)&nosvs_enter_altstack_end - | | 622 | size = (size_t)&nosvs_enter_altstack_end - |
623 | (size_t)&nosvs_enter_altstack; | | 623 | (size_t)&nosvs_enter_altstack; |
624 | x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); | | 624 | x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size); |
625 | | | 625 | |
626 | bytes = &nosvs_leave; | | 626 | bytes = &nosvs_leave; |
627 | size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave; | | 627 | size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave; |
628 | x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); | | 628 | x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size); |
629 | | | 629 | |
630 | bytes = &nosvs_leave_altstack; | | 630 | bytes = &nosvs_leave_altstack; |
631 | size = (size_t)&nosvs_leave_altstack_end - | | 631 | size = (size_t)&nosvs_leave_altstack_end - |
632 | (size_t)&nosvs_leave_altstack; | | 632 | (size_t)&nosvs_leave_altstack; |
633 | x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); | | 633 | x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size); |
634 | | | 634 | |
635 | x86_patch_window_close(psl, cr0); | | 635 | x86_patch_window_close(psl, cr0); |
636 | } | | 636 | } |
637 | | | 637 | |
/*
 * Rendezvous counters for the SVS-disable cross-call: each counts down
 * from ncpu, and every CPU spins until the counter reaches zero.
 */
static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned;
static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned;
/* Type of an entry vector, e.g. Xsyscall. */
typedef void (vector)(void);
641 | | | 641 | |
/*
 * Cross-call handler run on every CPU to tear down SVS. All CPUs
 * rendezvous on svs_cpu_barrier1 before the primary CPU hotpatches
 * the kernel text, and again on svs_cpu_barrier2 afterwards, so that
 * no CPU executes half-patched entry code. Runs with interrupts
 * disabled for the whole sequence.
 */
static void
svs_disable_cpu(void *arg1, void *arg2)
{
	struct cpu_info *ci = curcpu();
	extern vector Xsyscall;
	u_long psl;

	/* Save the interrupt state, then block interrupts. */
	psl = x86_read_psl();
	x86_disable_intr();

	/* Wait for every CPU to reach this point. */
	atomic_dec_ulong(&svs_cpu_barrier1);
	while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0) {
		x86_pause();
	}

	/* cpu0 is the one that does the hotpatch job */
	if (ci == &cpu_info_primary) {
		svs_enabled = false;
		svs_disable_hotpatch();
	}

	/* put back the non-SVS syscall entry point */
	wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);

	/* enable global pages */
	if (cpu_feature[0] & CPUID_PGE)
		lcr4(rcr4() | CR4_PGE);

	/* Wait until the patching is done before resuming execution. */
	atomic_dec_ulong(&svs_cpu_barrier2);
	while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0) {
		x86_pause();
	}

	/* Write back and invalidate cache, flush pipelines. */
	wbinvd();
	x86_flush();

	/* Restore the saved interrupt state. */
	x86_write_psl(psl);
}
680 | | | 681 | |
681 | static int | | 682 | static int |
682 | svs_disable(void) | | 683 | svs_disable(void) |
683 | { | | 684 | { |
684 | struct cpu_info *ci = NULL; | | 685 | struct cpu_info *ci = NULL; |
685 | CPU_INFO_ITERATOR cii; | | 686 | CPU_INFO_ITERATOR cii; |
686 | uint64_t xc; | | 687 | uint64_t xc; |
687 | | | 688 | |
688 | mutex_enter(&cpu_lock); | | 689 | mutex_enter(&cpu_lock); |
689 | | | 690 | |
690 | /* | | 691 | /* |
691 | * We expect all the CPUs to be online. | | 692 | * We expect all the CPUs to be online. |
692 | */ | | 693 | */ |
693 | for (CPU_INFO_FOREACH(cii, ci)) { | | 694 | for (CPU_INFO_FOREACH(cii, ci)) { |
694 | struct schedstate_percpu *spc = &ci->ci_schedstate; | | 695 | struct schedstate_percpu *spc = &ci->ci_schedstate; |
695 | if (spc->spc_flags & SPCF_OFFLINE) { | | 696 | if (spc->spc_flags & SPCF_OFFLINE) { |
696 | printf("[!] cpu%d offline, SVS not disabled\n", | | 697 | printf("[!] cpu%d offline, SVS not disabled\n", |
697 | cpu_index(ci)); | | 698 | cpu_index(ci)); |
698 | mutex_exit(&cpu_lock); | | 699 | mutex_exit(&cpu_lock); |
699 | return EOPNOTSUPP; | | 700 | return EOPNOTSUPP; |
700 | } | | 701 | } |
701 | } | | 702 | } |
702 | | | 703 | |
703 | svs_cpu_barrier1 = ncpu; | | 704 | svs_cpu_barrier1 = ncpu; |
704 | svs_cpu_barrier2 = ncpu; | | 705 | svs_cpu_barrier2 = ncpu; |
705 | | | 706 | |
706 | printf("[+] Disabling SVS..."); | | 707 | printf("[+] Disabling SVS..."); |
707 | xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL); | | 708 | xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL); |
708 | xc_wait(xc); | | 709 | xc_wait(xc); |
709 | printf(" done!\n"); | | 710 | printf(" done!\n"); |
710 | | | 711 | |
711 | mutex_exit(&cpu_lock); | | 712 | mutex_exit(&cpu_lock); |
712 | | | 713 | |
713 | return 0; | | 714 | return 0; |
714 | } | | 715 | } |
715 | | | 716 | |
716 | int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS); | | 717 | int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS); |
717 | | | 718 | |
718 | int | | 719 | int |
719 | sysctl_machdep_svs_enabled(SYSCTLFN_ARGS) | | 720 | sysctl_machdep_svs_enabled(SYSCTLFN_ARGS) |
720 | { | | 721 | { |
721 | struct sysctlnode node; | | 722 | struct sysctlnode node; |
722 | int error, val; | | 723 | int error, val; |
723 | | | 724 | |
724 | val = *(int *)rnode->sysctl_data; | | 725 | val = *(int *)rnode->sysctl_data; |
725 | | | 726 | |
726 | node = *rnode; | | 727 | node = *rnode; |
727 | node.sysctl_data = &val; | | 728 | node.sysctl_data = &val; |
728 | | | 729 | |
729 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); | | 730 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); |
730 | if (error != 0 || newp == NULL) | | 731 | if (error != 0 || newp == NULL) |
731 | return error; | | 732 | return error; |
732 | | | 733 | |
733 | if (val == 1) { | | 734 | if (val == 1) { |
734 | error = EINVAL; | | 735 | error = EINVAL; |
735 | } else { | | 736 | } else { |
736 | if (svs_enabled) | | 737 | if (svs_enabled) |
737 | error = svs_disable(); | | 738 | error = svs_disable(); |
738 | else | | 739 | else |
739 | error = 0; | | 740 | error = 0; |
740 | } | | 741 | } |
741 | | | 742 | |
742 | return error; | | 743 | return error; |
743 | } | | 744 | } |
744 | | | 745 | |
745 | void | | 746 | void |
746 | svs_init(void) | | 747 | svs_init(void) |
747 | { | | 748 | { |
748 | if (cpu_vendor != CPUVENDOR_INTEL) { | | 749 | if (cpu_vendor != CPUVENDOR_INTEL) { |
749 | return; | | 750 | return; |
750 | } | | 751 | } |
751 | svs_enable(); | | 752 | svs_enable(); |
752 | } | | 753 | } |