Wed Apr 27 07:42:11 2011 UTC
drop 'inline' here, to avoid C99 vs GNU differences


(plunky)
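For background on the commit message: a plain `inline' on an externally visible function definition means different things under the traditional GNU (gnu89) inline model and the C99 inline model, so the same code can yield a normal external definition in one mode and no external definition (and hence link errors elsewhere) in the other. A minimal sketch with a made-up function foo() in hypothetical files foo.h/foo.c, not taken from pmap.c:

    /* foo.h -- hypothetical example */
    void foo(void);

    /* foo.c */
    /*
     * gnu89 inline: this emits a normal external definition of foo() and
     * merely hints that calls may be inlined.
     * C99 inline: if every file-scope declaration of foo() in this unit
     * carries "inline" and none carries "extern", this is only an "inline
     * definition"; no external symbol is guaranteed, and callers in other
     * files may fail to link.
     */
    inline void
    foo(void)
    {
    }

Dropping the `inline' keyword from the definition, as the diff below does for pmap_reference(), leaves one ordinary external definition under both dialects and lets the compiler's normal heuristics decide about inlining.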
cvs diff -r1.119 -r1.120 src/sys/arch/x86/x86/pmap.c

--- src/sys/arch/x86/x86/pmap.c 2011/04/14 16:00:21 1.119
+++ src/sys/arch/x86/x86/pmap.c 2011/04/27 07:42:11 1.120
@@ -1,1769 +1,1769 @@
-/* $NetBSD: pmap.c,v 1.119 2011/04/14 16:00:21 yamt Exp $ */
+/* $NetBSD: pmap.c,v 1.120 2011/04/27 07:42:11 plunky Exp $ */

3/* 3/*
4 * Copyright (c) 2007 Manuel Bouyer. 4 * Copyright (c) 2007 Manuel Bouyer.
5 * 5 *
6 * Redistribution and use in source and binary forms, with or without 6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions 7 * modification, are permitted provided that the following conditions
8 * are met: 8 * are met:
9 * 1. Redistributions of source code must retain the above copyright 9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer. 10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright 11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the 12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution. 13 * documentation and/or other materials provided with the distribution.
14 * 14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 * 25 *
26 */ 26 */
27 27
28/* 28/*
29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30 * 30 *
31 * Permission to use, copy, modify, and distribute this software for any 31 * Permission to use, copy, modify, and distribute this software for any
32 * purpose with or without fee is hereby granted, provided that the above 32 * purpose with or without fee is hereby granted, provided that the above
33 * copyright notice and this permission notice appear in all copies. 33 * copyright notice and this permission notice appear in all copies.
34 * 34 *
35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42 */ 42 */
43 43
44/* 44/*
45 * Copyright (c) 1997 Charles D. Cranor and Washington University. 45 * Copyright (c) 1997 Charles D. Cranor and Washington University.
46 * All rights reserved. 46 * All rights reserved.
47 * 47 *
48 * Redistribution and use in source and binary forms, with or without 48 * Redistribution and use in source and binary forms, with or without
49 * modification, are permitted provided that the following conditions 49 * modification, are permitted provided that the following conditions
50 * are met: 50 * are met:
51 * 1. Redistributions of source code must retain the above copyright 51 * 1. Redistributions of source code must retain the above copyright
52 * notice, this list of conditions and the following disclaimer. 52 * notice, this list of conditions and the following disclaimer.
53 * 2. Redistributions in binary form must reproduce the above copyright 53 * 2. Redistributions in binary form must reproduce the above copyright
54 * notice, this list of conditions and the following disclaimer in the 54 * notice, this list of conditions and the following disclaimer in the
55 * documentation and/or other materials provided with the distribution. 55 * documentation and/or other materials provided with the distribution.
56 * 56 *
57 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 57 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
58 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 58 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
59 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 59 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
60 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 60 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
61 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 61 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
62 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 62 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
63 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 63 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
64 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 64 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 65 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
66 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 66 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
67 */ 67 */
68 68
69/* 69/*
70 * Copyright 2001 (c) Wasabi Systems, Inc. 70 * Copyright 2001 (c) Wasabi Systems, Inc.
71 * All rights reserved. 71 * All rights reserved.
72 * 72 *
73 * Written by Frank van der Linden for Wasabi Systems, Inc. 73 * Written by Frank van der Linden for Wasabi Systems, Inc.
74 * 74 *
75 * Redistribution and use in source and binary forms, with or without 75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions 76 * modification, are permitted provided that the following conditions
77 * are met: 77 * are met:
78 * 1. Redistributions of source code must retain the above copyright 78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer. 79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright 80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the 81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution. 82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software 83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgement: 84 * must display the following acknowledgement:
85 * This product includes software developed for the NetBSD Project by 85 * This product includes software developed for the NetBSD Project by
86 * Wasabi Systems, Inc. 86 * Wasabi Systems, Inc.
87 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 87 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
88 * or promote products derived from this software without specific prior 88 * or promote products derived from this software without specific prior
89 * written permission. 89 * written permission.
90 * 90 *
91 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 91 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
93 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 93 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 94 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
95 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 95 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
96 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 96 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
97 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 97 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
98 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 98 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
99 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 99 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
100 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 100 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
101 * POSSIBILITY OF SUCH DAMAGE. 101 * POSSIBILITY OF SUCH DAMAGE.
102 */ 102 */
103 103
/*
 * This is the i386 pmap modified and generalized to support x86-64
 * as well. The idea is to hide the upper N levels of the page tables
 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
 * is mostly untouched, except that it uses some more generalized
 * macros and interfaces.
 *
 * This pmap has been tested on the i386 as well, and it can be easily
 * adapted to PAE.
 *
 * fvdl@wasabisystems.com 18-Jun-2001
 */

/*
 * pmap.c: i386 pmap module rewrite
 * Chuck Cranor <chuck@netbsd>
 * 11-Aug-97
 *
 * history of this pmap module: in addition to my own input, i used
 * the following references for this rewrite of the i386 pmap:
 *
 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
 * BSD hp300 pmap done by Mike Hibler at University of Utah.
 * it was then ported to the i386 by William Jolitz of UUNET
 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
 * project fixed some bugs and provided some speed ups.
 *
 * [2] the FreeBSD i386 pmap. this pmap seems to be the
 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
 * and David Greenman.
 *
 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
 * between several processors. the VAX version was done by
 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
 * David Golub, and Richard Draves. the alpha version was
 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
 * (NetBSD/alpha).
 */

#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.119 2011/04/14 16:00:21 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.120 2011/04/27 07:42:11 plunky Exp $");

#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#if !defined(__x86_64__)
#include "opt_kstack_dr0.h"
#endif /* !defined(__x86_64__) */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>

#include <dev/isa/isareg.h>

#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>

#include <x86/pmap.h>
#include <x86/pmap_pv.h>

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

#ifdef XEN
#include <xen/xen3-public/xen.h>
#include <xen/hypervisor.h>
#endif

/* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
#if defined(XEN) && defined(__x86_64__)
#define PG_k PG_u
#else
#define PG_k 0
#endif

/*
 * general info:
 *
 * - for an explanation of how the i386 MMU hardware works see
 * the comments in <machine/pte.h>.
 *
 * - for an explanation of the general memory structure used by
 * this pmap (including the recursive mapping), see the comments
 * in <machine/pmap.h>.
 *
 * this file contains the code for the "pmap module." the module's
 * job is to manage the hardware's virtual to physical address mappings.
 * note that there are two levels of mapping in the VM system:
 *
 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
 * to map ranges of virtual address space to objects/files. for
 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
 * to the file /bin/ls starting at offset zero." note that
 * the upper layer mapping is not concerned with how individual
 * vm_pages are mapped.
 *
 * [2] the lower layer of the VM system (the pmap) maintains the mappings
 * from virtual addresses. it is concerned with which vm_page is
 * mapped where. for example, when you run /bin/ls and start
 * at page 0x1000 the fault routine may lookup the correct page
 * of the /bin/ls file and then ask the pmap layer to establish
 * a mapping for it.
 *
 * note that information in the lower layer of the VM system can be
 * thrown away since it can easily be reconstructed from the info
 * in the upper layer.
 *
 * data structures we use include:
 *
 * - struct pmap: describes the address space of one thread
 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 * - struct pv_head: there is one pv_head per managed page of
 * physical memory. the pv_head points to a list of pv_entry
 * structures which describe all the <PMAP,VA> pairs that this
 * page is mapped in. this is critical for page based operations
 * such as pmap_page_protect() [change protection on _all_ mappings
 * of a page]
 */

/*
 * memory allocation
 *
 * - there are three data structures that we must dynamically allocate:
 *
 * [A] new process' page directory page (PDP)
 * - plan 1: done at pmap_create() we use
 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this
 * allocation.
 *
 * if we are low in free physical memory then we sleep in
 * uvm_km_alloc -- in this case this is ok since we are creating
 * a new pmap and should not be holding any locks.
 *
 * if the kernel is totally out of virtual space
 * (i.e. uvm_km_alloc returns NULL), then we panic.
 *
 * [B] new page tables pages (PTP)
 * - call uvm_pagealloc()
 * => success: zero page, add to pm_pdir
 * => failure: we are out of free vm_pages, let pmap_enter()
 * tell UVM about it.
 *
 * note: for kernel PTPs, we start with NKPTP of them. as we map
 * kernel memory (at uvm_map time) we check to see if we've grown
 * the kernel pmap. if so, we call the optional function
 * pmap_growkernel() to grow the kernel PTPs in advance.
 *
 * [C] pv_entry structures
 */

/*
 * locking
 *
 * we have the following locks that we must contend with:
 *
 * mutexes:
 *
 * - pmap lock (per pmap, part of uvm_object)
 * this lock protects the fields in the pmap structure including
 * the non-kernel PDEs in the PDP, and the PTEs. it also locks
 * in the alternate PTE space (since that is determined by the
 * entry in the PDP).
 *
 * - pvh_lock (per pv_head)
 * this lock protects the pv_entry list which is chained off the
 * pv_head structure for a specific managed PA. it is locked
 * when traversing the list (e.g. adding/removing mappings,
 * syncing R/M bits, etc.)
 *
 * - pmaps_lock
 * this lock protects the list of active pmaps (headed by "pmaps").
 * we lock it when adding or removing pmaps from this list.
 *
 * tlb shootdown
 *
 * tlb shootdowns are hard interrupts that operate outside the spl
 * framework: they don't need to be blocked provided that the pmap module
 * gets the order of events correct. the calls are made by talking directly
 * to the lapic. the stubs to handle the interrupts are quite short and do
 * one of the following: invalidate a single page, a range of pages, all
 * user tlb entries or the entire tlb.
 *
 * the cpus synchronize with each other using pmap_mbox structures which are
 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap
 * use a global mailbox and are generated using a broadcast ipi (broadcast
 * to all but the sending cpu). shootdowns against regular pmaps use
 * per-cpu mailboxes and are multicast. kernel and user shootdowns can
 * execute simultaneously, as can shootdowns within different multithreaded
 * processes. TODO:
 *
 * 1. figure out which waitpoints can be deferered to pmap_update().
 * 2. see if there is a cheap way to batch some updates.
 */
310 310
311const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 311const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
312const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 312const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
313const long nkptpmax[] = NKPTPMAX_INITIALIZER; 313const long nkptpmax[] = NKPTPMAX_INITIALIZER;
314const long nbpd[] = NBPD_INITIALIZER; 314const long nbpd[] = NBPD_INITIALIZER;
315pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 315pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
316pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 316pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
317 317
318long nkptp[] = NKPTP_INITIALIZER; 318long nkptp[] = NKPTP_INITIALIZER;
319 319
320static kmutex_t pmaps_lock; 320static kmutex_t pmaps_lock;
321 321
322static vaddr_t pmap_maxkvaddr; 322static vaddr_t pmap_maxkvaddr;
323 323
324#define COUNT(x) /* nothing */ 324#define COUNT(x) /* nothing */
325 325
326/* 326/*
327 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 327 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
328 * actual locking is done by pm_lock. 328 * actual locking is done by pm_lock.
329 */ 329 */
330#if defined(DIAGNOSTIC) 330#if defined(DIAGNOSTIC)
331#define PMAP_SUBOBJ_LOCK(pm, idx) \ 331#define PMAP_SUBOBJ_LOCK(pm, idx) \
332 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 332 KASSERT(mutex_owned(&(pm)->pm_lock)); \
333 if ((idx) != 0) \ 333 if ((idx) != 0) \
334 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 334 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
335#define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 335#define PMAP_SUBOBJ_UNLOCK(pm, idx) \
336 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 336 KASSERT(mutex_owned(&(pm)->pm_lock)); \
337 if ((idx) != 0) \ 337 if ((idx) != 0) \
338 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 338 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
339#else /* defined(DIAGNOSTIC) */ 339#else /* defined(DIAGNOSTIC) */
340#define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 340#define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */
341#define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 341#define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */
342#endif /* defined(DIAGNOSTIC) */ 342#endif /* defined(DIAGNOSTIC) */
343 343
344/* 344/*
345 * Misc. event counters. 345 * Misc. event counters.
346 */ 346 */
347struct evcnt pmap_iobmp_evcnt; 347struct evcnt pmap_iobmp_evcnt;
348struct evcnt pmap_ldt_evcnt; 348struct evcnt pmap_ldt_evcnt;
349 349
350/* 350/*
351 * Global TLB shootdown mailbox. 351 * Global TLB shootdown mailbox.
352 */ 352 */
353struct evcnt pmap_tlb_evcnt __aligned(64); 353struct evcnt pmap_tlb_evcnt __aligned(64);
354struct pmap_mbox pmap_mbox __aligned(64); 354struct pmap_mbox pmap_mbox __aligned(64);
355 355
356/* 356/*
357 * PAT 357 * PAT
358 */ 358 */
359#define PATENTRY(n, type) (type << ((n) * 8)) 359#define PATENTRY(n, type) (type << ((n) * 8))
360#define PAT_UC 0x0ULL 360#define PAT_UC 0x0ULL
361#define PAT_WC 0x1ULL 361#define PAT_WC 0x1ULL
362#define PAT_WT 0x4ULL 362#define PAT_WT 0x4ULL
363#define PAT_WP 0x5ULL 363#define PAT_WP 0x5ULL
364#define PAT_WB 0x6ULL 364#define PAT_WB 0x6ULL
365#define PAT_UCMINUS 0x7ULL 365#define PAT_UCMINUS 0x7ULL
366 366
367static bool cpu_pat_enabled = false; 367static bool cpu_pat_enabled = false;
368 368
369 369
370/* 370/*
371 * Per-CPU data. The pmap mailbox is cache intensive so gets its 371 * Per-CPU data. The pmap mailbox is cache intensive so gets its
372 * own line. Note that the mailbox must be the first item. 372 * own line. Note that the mailbox must be the first item.
373 */ 373 */
374struct pmap_cpu { 374struct pmap_cpu {
375 /* TLB shootdown */ 375 /* TLB shootdown */
376 struct pmap_mbox pc_mbox; 376 struct pmap_mbox pc_mbox;
377}; 377};
378 378
379union { 379union {
380 struct pmap_cpu pc; 380 struct pmap_cpu pc;
381 uint8_t padding[64]; 381 uint8_t padding[64];
382} pmap_cpu[MAXCPUS] __aligned(64); 382} pmap_cpu[MAXCPUS] __aligned(64);
383 383
384/* 384/*
385 * global data structures 385 * global data structures
386 */ 386 */
387 387
388static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 388static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
389struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 389struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
390 390
391/* 391/*
392 * pmap_pg_g: if our processor supports PG_G in the PTE then we 392 * pmap_pg_g: if our processor supports PG_G in the PTE then we
393 * set pmap_pg_g to PG_G (otherwise it is zero). 393 * set pmap_pg_g to PG_G (otherwise it is zero).
394 */ 394 */
395 395
396int pmap_pg_g = 0; 396int pmap_pg_g = 0;
397 397
398/* 398/*
399 * pmap_largepages: if our processor supports PG_PS and we are 399 * pmap_largepages: if our processor supports PG_PS and we are
400 * using it, this is set to true. 400 * using it, this is set to true.
401 */ 401 */
402 402
403int pmap_largepages; 403int pmap_largepages;
404 404
405/* 405/*
406 * i386 physical memory comes in a big contig chunk with a small 406 * i386 physical memory comes in a big contig chunk with a small
407 * hole toward the front of it... the following two paddr_t's 407 * hole toward the front of it... the following two paddr_t's
408 * (shared with machdep.c) describe the physical address space 408 * (shared with machdep.c) describe the physical address space
409 * of this machine. 409 * of this machine.
410 */ 410 */
411paddr_t avail_start; /* PA of first available physical page */ 411paddr_t avail_start; /* PA of first available physical page */
412paddr_t avail_end; /* PA of last available physical page */ 412paddr_t avail_end; /* PA of last available physical page */
413 413
414#ifdef XEN 414#ifdef XEN
415#ifdef __x86_64__ 415#ifdef __x86_64__
416/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 416/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
417static paddr_t xen_dummy_user_pgd; 417static paddr_t xen_dummy_user_pgd;
418#endif /* __x86_64__ */ 418#endif /* __x86_64__ */
419paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 419paddr_t pmap_pa_start; /* PA of first physical page for this domain */
420paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 420paddr_t pmap_pa_end; /* PA of last physical page for this domain */
421#endif /* XEN */ 421#endif /* XEN */
422 422
423#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 423#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
424 424
425#define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 425#define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock)
426#define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 426#define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock)
427#define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 427#define pp_locked(pp) mutex_owned(&(pp)->pp_lock)
428 428
429#define PV_HASH_SIZE 32768 429#define PV_HASH_SIZE 32768
430#define PV_HASH_LOCK_CNT 32 430#define PV_HASH_LOCK_CNT 32
431 431
432struct pv_hash_lock { 432struct pv_hash_lock {
433 kmutex_t lock; 433 kmutex_t lock;
434} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 434} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
435 __aligned(CACHE_LINE_SIZE); 435 __aligned(CACHE_LINE_SIZE);
436 436
437struct pv_hash_head { 437struct pv_hash_head {
438 SLIST_HEAD(, pv_entry) hh_list; 438 SLIST_HEAD(, pv_entry) hh_list;
439} pv_hash_heads[PV_HASH_SIZE]; 439} pv_hash_heads[PV_HASH_SIZE];
440 440
441static u_int 441static u_int
442pvhash_hash(struct vm_page *ptp, vaddr_t va) 442pvhash_hash(struct vm_page *ptp, vaddr_t va)
443{ 443{
444 444
445 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 445 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
446} 446}
447 447
448static struct pv_hash_head * 448static struct pv_hash_head *
449pvhash_head(u_int hash) 449pvhash_head(u_int hash)
450{ 450{
451 451
452 return &pv_hash_heads[hash % PV_HASH_SIZE]; 452 return &pv_hash_heads[hash % PV_HASH_SIZE];
453} 453}
454 454
455static kmutex_t * 455static kmutex_t *
456pvhash_lock(u_int hash) 456pvhash_lock(u_int hash)
457{ 457{
458 458
459 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 459 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
460} 460}
461 461
462static struct pv_entry * 462static struct pv_entry *
463pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 463pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
464{ 464{
465 struct pv_entry *pve; 465 struct pv_entry *pve;
466 struct pv_entry *prev; 466 struct pv_entry *prev;
467 467
468 prev = NULL; 468 prev = NULL;
469 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 469 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
470 if (pve->pve_pte.pte_ptp == ptp && 470 if (pve->pve_pte.pte_ptp == ptp &&
471 pve->pve_pte.pte_va == va) { 471 pve->pve_pte.pte_va == va) {
472 if (prev != NULL) { 472 if (prev != NULL) {
473 SLIST_REMOVE_AFTER(prev, pve_hash); 473 SLIST_REMOVE_AFTER(prev, pve_hash);
474 } else { 474 } else {
475 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 475 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
476 } 476 }
477 break; 477 break;
478 } 478 }
479 prev = pve; 479 prev = pve;
480 } 480 }
481 return pve; 481 return pve;
482} 482}
483 483
484/* 484/*
485 * other data structures 485 * other data structures
486 */ 486 */
487 487
488static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 488static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
489static bool pmap_initialized = false; /* pmap_init done yet? */ 489static bool pmap_initialized = false; /* pmap_init done yet? */
490 490
491/* 491/*
492 * the following two vaddr_t's are used during system startup 492 * the following two vaddr_t's are used during system startup
493 * to keep track of how much of the kernel's VM space we have used. 493 * to keep track of how much of the kernel's VM space we have used.
494 * once the system is started, the management of the remaining kernel 494 * once the system is started, the management of the remaining kernel
495 * VM space is turned over to the kernel_map vm_map. 495 * VM space is turned over to the kernel_map vm_map.
496 */ 496 */
497 497
498static vaddr_t virtual_avail; /* VA of first free KVA */ 498static vaddr_t virtual_avail; /* VA of first free KVA */
499static vaddr_t virtual_end; /* VA of last free KVA */ 499static vaddr_t virtual_end; /* VA of last free KVA */
500 500
501/* 501/*
502 * linked list of all non-kernel pmaps 502 * linked list of all non-kernel pmaps
503 */ 503 */
504 504
505static struct pmap_head pmaps; 505static struct pmap_head pmaps;
506 506
507/* 507/*
508 * pool that pmap structures are allocated from 508 * pool that pmap structures are allocated from
509 */ 509 */
510 510
511static struct pool_cache pmap_cache; 511static struct pool_cache pmap_cache;
512 512
513/* 513/*
514 * pv_entry cache 514 * pv_entry cache
515 */ 515 */
516 516
517static struct pool_cache pmap_pv_cache; 517static struct pool_cache pmap_pv_cache;
518 518
519/* 519/*
520 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 520 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
521 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 521 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
522 * due to false sharing. 522 * due to false sharing.
523 */ 523 */
524 524
525#ifdef MULTIPROCESSOR 525#ifdef MULTIPROCESSOR
526#define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 526#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
527#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 527#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
528#else 528#else
529#define PTESLEW(pte, id) (pte) 529#define PTESLEW(pte, id) (pte)
530#define VASLEW(va,id) (va) 530#define VASLEW(va,id) (va)
531#endif 531#endif
532 532
533/* 533/*
534 * special VAs and the PTEs that map them 534 * special VAs and the PTEs that map them
535 */ 535 */
536static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 536static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
537static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 537static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
538 538
539int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 539int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
540 540
541/* 541/*
542 * pool and cache that PDPs are allocated from 542 * pool and cache that PDPs are allocated from
543 */ 543 */
544 544
545static struct pool_cache pmap_pdp_cache; 545static struct pool_cache pmap_pdp_cache;
546int pmap_pdp_ctor(void *, void *, int); 546int pmap_pdp_ctor(void *, void *, int);
547void pmap_pdp_dtor(void *, void *); 547void pmap_pdp_dtor(void *, void *);
548#ifdef PAE 548#ifdef PAE
549/* need to allocate items of 4 pages */ 549/* need to allocate items of 4 pages */
550void *pmap_pdp_alloc(struct pool *, int); 550void *pmap_pdp_alloc(struct pool *, int);
551void pmap_pdp_free(struct pool *, void *); 551void pmap_pdp_free(struct pool *, void *);
552static struct pool_allocator pmap_pdp_allocator = { 552static struct pool_allocator pmap_pdp_allocator = {
553 .pa_alloc = pmap_pdp_alloc, 553 .pa_alloc = pmap_pdp_alloc,
554 .pa_free = pmap_pdp_free, 554 .pa_free = pmap_pdp_free,
555 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 555 .pa_pagesz = PAGE_SIZE * PDP_SIZE,
556}; 556};
557#endif /* PAE */ 557#endif /* PAE */
558 558
559void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 559void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
560 560
561extern vaddr_t idt_vaddr; /* we allocate IDT early */ 561extern vaddr_t idt_vaddr; /* we allocate IDT early */
562extern paddr_t idt_paddr; 562extern paddr_t idt_paddr;
563 563
564#ifdef _LP64 564#ifdef _LP64
565extern vaddr_t lo32_vaddr; 565extern vaddr_t lo32_vaddr;
566extern vaddr_t lo32_paddr; 566extern vaddr_t lo32_paddr;
567#endif 567#endif
568 568
569extern int end; 569extern int end;
570 570
571#ifdef i386 571#ifdef i386
572/* stuff to fix the pentium f00f bug */ 572/* stuff to fix the pentium f00f bug */
573extern vaddr_t pentium_idt_vaddr; 573extern vaddr_t pentium_idt_vaddr;
574#endif 574#endif
575 575
576 576
577/* 577/*
578 * local prototypes 578 * local prototypes
579 */ 579 */
580 580
581static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 581static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
582 pd_entry_t * const *); 582 pd_entry_t * const *);
583static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 583static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
584static void pmap_freepage(struct pmap *, struct vm_page *, int); 584static void pmap_freepage(struct pmap *, struct vm_page *, int);
585static void pmap_free_ptp(struct pmap *, struct vm_page *, 585static void pmap_free_ptp(struct pmap *, struct vm_page *,
586 vaddr_t, pt_entry_t *, 586 vaddr_t, pt_entry_t *,
587 pd_entry_t * const *); 587 pd_entry_t * const *);
588static bool pmap_is_curpmap(struct pmap *); 588static bool pmap_is_curpmap(struct pmap *);
589static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 589static bool pmap_is_active(struct pmap *, struct cpu_info *, bool);
590static bool pmap_remove_pte(struct pmap *, struct vm_page *, 590static bool pmap_remove_pte(struct pmap *, struct vm_page *,
591 pt_entry_t *, vaddr_t, 591 pt_entry_t *, vaddr_t,
592 struct pv_entry **); 592 struct pv_entry **);
593static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 593static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *,
594 vaddr_t, vaddr_t, vaddr_t, 594 vaddr_t, vaddr_t, vaddr_t,
595 struct pv_entry **); 595 struct pv_entry **);
596 596
597static void pmap_unmap_apdp(void); 597static void pmap_unmap_apdp(void);
598static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 598static bool pmap_get_physpage(vaddr_t, int, paddr_t *);
599static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 599static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
600 long *); 600 long *);
601 601
602static bool pmap_reactivate(struct pmap *); 602static bool pmap_reactivate(struct pmap *);
603 603
604/* 604/*
605 * p m a p h e l p e r f u n c t i o n s 605 * p m a p h e l p e r f u n c t i o n s
606 */ 606 */
607 607
608static inline void 608static inline void
609pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 609pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
610{ 610{
611 611
612 if (pmap == pmap_kernel()) { 612 if (pmap == pmap_kernel()) {
613 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 613 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
614 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 614 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
615 } else { 615 } else {
616 KASSERT(mutex_owned(&pmap->pm_lock)); 616 KASSERT(mutex_owned(&pmap->pm_lock));
617 pmap->pm_stats.resident_count += resid_diff; 617 pmap->pm_stats.resident_count += resid_diff;
618 pmap->pm_stats.wired_count += wired_diff; 618 pmap->pm_stats.wired_count += wired_diff;
619 } 619 }
620} 620}
621 621
622static inline void 622static inline void
623pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 623pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
624{ 624{
625 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 625 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
626 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0); 626 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
627 627
628 KASSERT((npte & (PG_V | PG_W)) != PG_W); 628 KASSERT((npte & (PG_V | PG_W)) != PG_W);
629 KASSERT((opte & (PG_V | PG_W)) != PG_W); 629 KASSERT((opte & (PG_V | PG_W)) != PG_W);
630 630
631 pmap_stats_update(pmap, resid_diff, wired_diff); 631 pmap_stats_update(pmap, resid_diff, wired_diff);
632} 632}
633 633
634/* 634/*
635 * ptp_to_pmap: lookup pmap by ptp 635 * ptp_to_pmap: lookup pmap by ptp
636 */ 636 */
637 637
638static struct pmap * 638static struct pmap *
639ptp_to_pmap(struct vm_page *ptp) 639ptp_to_pmap(struct vm_page *ptp)
640{ 640{
641 struct pmap *pmap; 641 struct pmap *pmap;
642 642
643 if (ptp == NULL) { 643 if (ptp == NULL) {
644 return pmap_kernel(); 644 return pmap_kernel();
645 } 645 }
646 pmap = (struct pmap *)ptp->uobject; 646 pmap = (struct pmap *)ptp->uobject;
647 KASSERT(pmap != NULL); 647 KASSERT(pmap != NULL);
648 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 648 KASSERT(&pmap->pm_obj[0] == ptp->uobject);
649 return pmap; 649 return pmap;
650} 650}
651 651
652static inline struct pv_pte * 652static inline struct pv_pte *
653pve_to_pvpte(struct pv_entry *pve) 653pve_to_pvpte(struct pv_entry *pve)
654{ 654{
655 655
656 KASSERT((void *)&pve->pve_pte == (void *)pve); 656 KASSERT((void *)&pve->pve_pte == (void *)pve);
657 return &pve->pve_pte; 657 return &pve->pve_pte;
658} 658}
659 659
660static inline struct pv_entry * 660static inline struct pv_entry *
661pvpte_to_pve(struct pv_pte *pvpte) 661pvpte_to_pve(struct pv_pte *pvpte)
662{ 662{
663 struct pv_entry *pve = (void *)pvpte; 663 struct pv_entry *pve = (void *)pvpte;
664 664
665 KASSERT(pve_to_pvpte(pve) == pvpte); 665 KASSERT(pve_to_pvpte(pve) == pvpte);
666 return pve; 666 return pve;
667} 667}
668 668
669/* 669/*
670 * pv_pte_first, pv_pte_next: PV list iterator. 670 * pv_pte_first, pv_pte_next: PV list iterator.
671 */ 671 */
672 672
673static struct pv_pte * 673static struct pv_pte *
674pv_pte_first(struct pmap_page *pp) 674pv_pte_first(struct pmap_page *pp)
675{ 675{
676 676
677 KASSERT(pp_locked(pp)); 677 KASSERT(pp_locked(pp));
678 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 678 if ((pp->pp_flags & PP_EMBEDDED) != 0) {
679 return &pp->pp_pte; 679 return &pp->pp_pte;
680 } 680 }
681 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 681 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
682} 682}
683 683
684static struct pv_pte * 684static struct pv_pte *
685pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 685pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
686{ 686{
687 687
688 KASSERT(pvpte != NULL); 688 KASSERT(pvpte != NULL);
689 KASSERT(pp_locked(pp)); 689 KASSERT(pp_locked(pp));
690 if (pvpte == &pp->pp_pte) { 690 if (pvpte == &pp->pp_pte) {
691 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 691 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
692 return NULL; 692 return NULL;
693 } 693 }
694 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 694 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
695 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 695 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
696} 696}
697 697
698/* 698/*
699 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 699 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
700 * of course the kernel is always loaded 700 * of course the kernel is always loaded
701 */ 701 */
702 702
703inline static bool 703inline static bool
704pmap_is_curpmap(struct pmap *pmap) 704pmap_is_curpmap(struct pmap *pmap)
705{ 705{
706#if defined(XEN) && defined(__x86_64__) 706#if defined(XEN) && defined(__x86_64__)
707 /* 707 /*
708 * Only kernel pmap is physically loaded. 708 * Only kernel pmap is physically loaded.
709 * User PGD may be active, but TLB will be flushed 709 * User PGD may be active, but TLB will be flushed
710 * with HYPERVISOR_iret anyway, so let's say no 710 * with HYPERVISOR_iret anyway, so let's say no
711 */ 711 */
712 return(pmap == pmap_kernel()); 712 return(pmap == pmap_kernel());
713#else /* XEN && __x86_64__*/ 713#else /* XEN && __x86_64__*/
714 return((pmap == pmap_kernel()) || 714 return((pmap == pmap_kernel()) ||
715 (pmap == curcpu()->ci_pmap)); 715 (pmap == curcpu()->ci_pmap));
716#endif 716#endif
717} 717}
718 718
719/* 719/*
720 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 720 * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
721 */ 721 */
722 722
723inline static bool 723inline static bool
724pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 724pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
725{ 725{
726 726
727 return (pmap == pmap_kernel() || 727 return (pmap == pmap_kernel() ||
728 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 728 (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
729 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 729 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
730} 730}
731 731
732static void 732static void
733pmap_apte_flush(struct pmap *pmap) 733pmap_apte_flush(struct pmap *pmap)
734{ 734{
735 735
736 KASSERT(kpreempt_disabled()); 736 KASSERT(kpreempt_disabled());
737 737
738 /* 738 /*
739 * Flush the APTE mapping from all other CPUs that 739 * Flush the APTE mapping from all other CPUs that
740 * are using the pmap we are using (who's APTE space 740 * are using the pmap we are using (who's APTE space
741 * is the one we've just modified). 741 * is the one we've just modified).
742 * 742 *
743 * XXXthorpej -- find a way to defer the IPI. 743 * XXXthorpej -- find a way to defer the IPI.
744 */ 744 */
745 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 745 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
746 pmap_tlb_shootwait(); 746 pmap_tlb_shootwait();
747} 747}
748 748
749/* 749/*
750 * Unmap the content of APDP PDEs 750 * Unmap the content of APDP PDEs
751 */ 751 */
752static void 752static void
753pmap_unmap_apdp(void) 753pmap_unmap_apdp(void)
754{ 754{
755 int i; 755 int i;
756 756
757 for (i = 0; i < PDP_SIZE; i++) { 757 for (i = 0; i < PDP_SIZE; i++) {
758 pmap_pte_set(APDP_PDE+i, 0); 758 pmap_pte_set(APDP_PDE+i, 0);
759#if defined (XEN) && defined (PAE) 759#if defined (XEN) && defined (PAE)
760 /* clear shadow entries too */ 760 /* clear shadow entries too */
761 pmap_pte_set(APDP_PDE_SHADOW+i, 0); 761 pmap_pte_set(APDP_PDE_SHADOW+i, 0);
762#endif 762#endif
763 } 763 }
764} 764}

/*
 * Add a reference to the specified pmap.
 */

-inline void
+void
pmap_reference(struct pmap *pmap)
{

	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}

777/* 777/*
778 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 778 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
779 * 779 *
780 * => we lock enough pmaps to keep things locked in 780 * => we lock enough pmaps to keep things locked in
781 * => must be undone with pmap_unmap_ptes before returning 781 * => must be undone with pmap_unmap_ptes before returning
782 */ 782 */
783 783
784void 784void
785pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 785pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
786 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 786 pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
787{ 787{
788 pd_entry_t opde, npde; 788 pd_entry_t opde, npde;
789 struct pmap *ourpmap; 789 struct pmap *ourpmap;
790 struct cpu_info *ci; 790 struct cpu_info *ci;
791 struct lwp *l; 791 struct lwp *l;
792 bool iscurrent; 792 bool iscurrent;
793 uint64_t ncsw; 793 uint64_t ncsw;
794#ifdef XEN 794#ifdef XEN
795 int s, i; 795 int s, i;
796#endif 796#endif
797 797
798 /* the kernel's pmap is always accessible */ 798 /* the kernel's pmap is always accessible */
799 if (pmap == pmap_kernel()) { 799 if (pmap == pmap_kernel()) {
800 *pmap2 = NULL; 800 *pmap2 = NULL;
801 *ptepp = PTE_BASE; 801 *ptepp = PTE_BASE;
802 *pdeppp = normal_pdes; 802 *pdeppp = normal_pdes;
803 return; 803 return;
804 } 804 }
805 KASSERT(kpreempt_disabled()); 805 KASSERT(kpreempt_disabled());
806 806
807 retry: 807 retry:
808 l = curlwp; 808 l = curlwp;
809 ncsw = l->l_ncsw; 809 ncsw = l->l_ncsw;
810 ourpmap = NULL; 810 ourpmap = NULL;
811 ci = curcpu(); 811 ci = curcpu();
812#if defined(XEN) && defined(__x86_64__) 812#if defined(XEN) && defined(__x86_64__)
813 /* 813 /*
814 * curmap can only be pmap_kernel so at this point 814 * curmap can only be pmap_kernel so at this point
815 * pmap_is_curpmap is always false 815 * pmap_is_curpmap is always false
816 */ 816 */
817 iscurrent = 0; 817 iscurrent = 0;
818 ourpmap = pmap_kernel(); 818 ourpmap = pmap_kernel();
819#else /* XEN && __x86_64__*/ 819#else /* XEN && __x86_64__*/
820 if (ci->ci_want_pmapload && 820 if (ci->ci_want_pmapload &&
821 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 821 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
822 pmap_load(); 822 pmap_load();
823 if (l->l_ncsw != ncsw) 823 if (l->l_ncsw != ncsw)
824 goto retry; 824 goto retry;
825 } 825 }
826 iscurrent = pmap_is_curpmap(pmap); 826 iscurrent = pmap_is_curpmap(pmap);
827 /* if curpmap then we are always mapped */ 827 /* if curpmap then we are always mapped */
828 if (iscurrent) { 828 if (iscurrent) {
829 mutex_enter(&pmap->pm_lock); 829 mutex_enter(&pmap->pm_lock);
830 *pmap2 = NULL; 830 *pmap2 = NULL;
831 *ptepp = PTE_BASE; 831 *ptepp = PTE_BASE;
832 *pdeppp = normal_pdes; 832 *pdeppp = normal_pdes;
833 goto out; 833 goto out;
834 } 834 }
835 ourpmap = ci->ci_pmap; 835 ourpmap = ci->ci_pmap;
836#endif /* XEN && __x86_64__ */ 836#endif /* XEN && __x86_64__ */
837 837
838 /* need to lock both curpmap and pmap: use ordered locking */ 838 /* need to lock both curpmap and pmap: use ordered locking */
839 pmap_reference(ourpmap); 839 pmap_reference(ourpmap);
840 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 840 if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
841 mutex_enter(&pmap->pm_lock); 841 mutex_enter(&pmap->pm_lock);
842 mutex_enter(&ourpmap->pm_lock); 842 mutex_enter(&ourpmap->pm_lock);
843 } else { 843 } else {
844 mutex_enter(&ourpmap->pm_lock); 844 mutex_enter(&ourpmap->pm_lock);
845 mutex_enter(&pmap->pm_lock); 845 mutex_enter(&pmap->pm_lock);
846 } 846 }
847 847
848 if (l->l_ncsw != ncsw) 848 if (l->l_ncsw != ncsw)
849 goto unlock_and_retry; 849 goto unlock_and_retry;
850 850
851 /* need to load a new alternate pt space into curpmap? */ 851 /* need to load a new alternate pt space into curpmap? */
852 COUNT(apdp_pde_map); 852 COUNT(apdp_pde_map);
853 opde = *APDP_PDE; 853 opde = *APDP_PDE;
854 if (!pmap_valid_entry(opde) || 854 if (!pmap_valid_entry(opde) ||
855 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 855 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
856#ifdef XEN 856#ifdef XEN
857 s = splvm(); 857 s = splvm();
858 /* Make recursive entry usable in user PGD */ 858 /* Make recursive entry usable in user PGD */
859 for (i = 0; i < PDP_SIZE; i++) { 859 for (i = 0; i < PDP_SIZE; i++) {
860 npde = pmap_pa2pte( 860 npde = pmap_pa2pte(
861 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 861 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
862 xpq_queue_pte_update( 862 xpq_queue_pte_update(
863 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 863 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
864 npde); 864 npde);
865 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 865 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
866 npde); 866 npde);
867#ifdef PAE 867#ifdef PAE
868 /* update shadow entry too */ 868 /* update shadow entry too */
869 xpq_queue_pte_update( 869 xpq_queue_pte_update(
870 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 870 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
871#endif /* PAE */ 871#endif /* PAE */
872 xpq_queue_invlpg( 872 xpq_queue_invlpg(
873 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 873 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
874 } 874 }
875 if (pmap_valid_entry(opde)) 875 if (pmap_valid_entry(opde))
876 pmap_apte_flush(ourpmap); 876 pmap_apte_flush(ourpmap);
877 splx(s); 877 splx(s);
878#else /* XEN */ 878#else /* XEN */
879 int i; 879 int i;
880 for (i = 0; i < PDP_SIZE; i++) { 880 for (i = 0; i < PDP_SIZE; i++) {
881 npde = pmap_pa2pte( 881 npde = pmap_pa2pte(
882 pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V; 882 pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
883 pmap_pte_set(APDP_PDE+i, npde); 883 pmap_pte_set(APDP_PDE+i, npde);
884 } 884 }
885 pmap_pte_flush(); 885 pmap_pte_flush();
886 if (pmap_valid_entry(opde)) 886 if (pmap_valid_entry(opde))
887 pmap_apte_flush(ourpmap); 887 pmap_apte_flush(ourpmap);
888#endif /* XEN */ 888#endif /* XEN */
889 } 889 }
890 *pmap2 = ourpmap; 890 *pmap2 = ourpmap;
891 *ptepp = APTE_BASE; 891 *ptepp = APTE_BASE;
892 *pdeppp = alternate_pdes; 892 *pdeppp = alternate_pdes;
893 KASSERT(l->l_ncsw == ncsw); 893 KASSERT(l->l_ncsw == ncsw);
894#if !defined(XEN) || !defined(__x86_64__) 894#if !defined(XEN) || !defined(__x86_64__)
895 out: 895 out:
896#endif 896#endif
897 /* 897 /*
898 * might have blocked, need to retry? 898 * might have blocked, need to retry?
899 */ 899 */
900 if (l->l_ncsw != ncsw) { 900 if (l->l_ncsw != ncsw) {
901 unlock_and_retry: 901 unlock_and_retry:
902 if (ourpmap != NULL) { 902 if (ourpmap != NULL) {
903 mutex_exit(&ourpmap->pm_lock); 903 mutex_exit(&ourpmap->pm_lock);
904 pmap_destroy(ourpmap); 904 pmap_destroy(ourpmap);
905 } 905 }
906 mutex_exit(&pmap->pm_lock); 906 mutex_exit(&pmap->pm_lock);
907 goto retry; 907 goto retry;
908 } 908 }
909 909
910 return; 910 return;
911} 911}
912 912
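The "ordered locking" above takes the two pmap locks in a fixed order chosen by comparing the pmap pointers, so that two threads locking the same pair from opposite ends can never wait on each other in a cycle. A minimal sketch of that idiom in isolation, using illustrative helper names that are not part of this file:

/*
 * Sketch only: acquire two pmap locks in a canonical (address) order,
 * mirroring the pointer comparison used above.
 */
static void
pmap_lock_pair(struct pmap *a, struct pmap *b)
{

	if ((uintptr_t)a < (uintptr_t)b) {
		mutex_enter(&a->pm_lock);
		mutex_enter(&b->pm_lock);
	} else {
		mutex_enter(&b->pm_lock);
		mutex_enter(&a->pm_lock);
	}
}

static void
pmap_unlock_pair(struct pmap *a, struct pmap *b)
{

	/* Release order does not matter for deadlock avoidance. */
	mutex_exit(&a->pm_lock);
	mutex_exit(&b->pm_lock);
}
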
913/* 913/*
914 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 914 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
915 */ 915 */
916 916
917void 917void
918pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 918pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
919{ 919{
920 920
921 if (pmap == pmap_kernel()) { 921 if (pmap == pmap_kernel()) {
922 return; 922 return;
923 } 923 }
924 KASSERT(kpreempt_disabled()); 924 KASSERT(kpreempt_disabled());
925 if (pmap2 == NULL) { 925 if (pmap2 == NULL) {
926 mutex_exit(&pmap->pm_lock); 926 mutex_exit(&pmap->pm_lock);
927 } else { 927 } else {
928#if defined(XEN) && defined(__x86_64__) 928#if defined(XEN) && defined(__x86_64__)
929 KASSERT(pmap2 == pmap_kernel()); 929 KASSERT(pmap2 == pmap_kernel());
930#else 930#else
931 KASSERT(curcpu()->ci_pmap == pmap2); 931 KASSERT(curcpu()->ci_pmap == pmap2);
932#endif 932#endif
933#if defined(MULTIPROCESSOR) 933#if defined(MULTIPROCESSOR)
934 pmap_unmap_apdp(); 934 pmap_unmap_apdp();
935 pmap_pte_flush(); 935 pmap_pte_flush();
936 pmap_apte_flush(pmap2); 936 pmap_apte_flush(pmap2);
937#endif 937#endif
938 COUNT(apdp_pde_unmap); 938 COUNT(apdp_pde_unmap);
939 mutex_exit(&pmap->pm_lock); 939 mutex_exit(&pmap->pm_lock);
940 mutex_exit(&pmap2->pm_lock); 940 mutex_exit(&pmap2->pm_lock);
941 pmap_destroy(pmap2); 941 pmap_destroy(pmap2);
942 } 942 }
943} 943}
944 944
945inline static void 945inline static void
946pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 946pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
947{ 947{
948 948
949#if !defined(__x86_64__) 949#if !defined(__x86_64__)
950 if (curproc == NULL || curproc->p_vmspace == NULL || 950 if (curproc == NULL || curproc->p_vmspace == NULL ||
951 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 951 pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
952 return; 952 return;
953 953
954 if ((opte ^ npte) & PG_X) 954 if ((opte ^ npte) & PG_X)
955 pmap_update_pg(va); 955 pmap_update_pg(va);
956 956
957 /* 957 /*
 958 * Executability was just removed from the highest executable 958 * Executability was just removed from the highest executable
 959 * mapping. Reset the code segment to something conservative and 959 * mapping. Reset the code segment to something conservative and
 960 * let the trap handler set the correct limit later; we cannot 960 * let the trap handler set the correct limit later; we cannot
 961 * recompute it here because of locking constraints on the vm map. 961 * recompute it here because of locking constraints on the vm map.
962 */ 962 */
963 963
964 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 964 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
965 struct trapframe *tf = curlwp->l_md.md_regs; 965 struct trapframe *tf = curlwp->l_md.md_regs;
966 966
967 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 967 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
968 pm->pm_hiexec = I386_MAX_EXE_ADDR; 968 pm->pm_hiexec = I386_MAX_EXE_ADDR;
969 } 969 }
970#endif /* !defined(__x86_64__) */ 970#endif /* !defined(__x86_64__) */
971} 971}
972 972
973#if !defined(__x86_64__) 973#if !defined(__x86_64__)
974/* 974/*
975 * Fixup the code segment to cover all potential executable mappings. 975 * Fixup the code segment to cover all potential executable mappings.
976 * returns 0 if no changes to the code segment were made. 976 * returns 0 if no changes to the code segment were made.
977 */ 977 */
978 978
979int 979int
980pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 980pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
981{ 981{
982 struct vm_map_entry *ent; 982 struct vm_map_entry *ent;
983 struct pmap *pm = vm_map_pmap(map); 983 struct pmap *pm = vm_map_pmap(map);
984 vaddr_t va = 0; 984 vaddr_t va = 0;
985 985
986 vm_map_lock_read(map); 986 vm_map_lock_read(map);
987 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 987 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
988 988
989 /* 989 /*
990 * This entry has greater va than the entries before. 990 * This entry has greater va than the entries before.
991 * We need to make it point to the last page, not past it. 991 * We need to make it point to the last page, not past it.
992 */ 992 */
993 993
994 if (ent->protection & VM_PROT_EXECUTE) 994 if (ent->protection & VM_PROT_EXECUTE)
995 va = trunc_page(ent->end) - PAGE_SIZE; 995 va = trunc_page(ent->end) - PAGE_SIZE;
996 } 996 }
997 vm_map_unlock_read(map); 997 vm_map_unlock_read(map);
998 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 998 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
999 return (0); 999 return (0);
1000 1000
1001 pm->pm_hiexec = va; 1001 pm->pm_hiexec = va;
1002 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 1002 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
1003 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 1003 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1004 } else { 1004 } else {
1005 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 1005 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1006 return (0); 1006 return (0);
1007 } 1007 }
1008 return (1); 1008 return (1);
1009} 1009}
1010#endif /* !defined(__x86_64__) */ 1010#endif /* !defined(__x86_64__) */
1011 1011
1012void 1012void
1013pat_init(struct cpu_info *ci) 1013pat_init(struct cpu_info *ci)
1014{ 1014{
1015 uint64_t pat; 1015 uint64_t pat;
1016 1016
1017 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 1017 if (!(ci->ci_feat_val[0] & CPUID_PAT))
1018 return; 1018 return;
1019 1019
1020 /* We change WT to WC. Leave all other entries the default values. */ 1020 /* We change WT to WC. Leave all other entries the default values. */
1021 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 1021 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1022 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 1022 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1023 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 1023 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1024 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 1024 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
1025 1025
1026 wrmsr(MSR_CR_PAT, pat); 1026 wrmsr(MSR_CR_PAT, pat);
1027 cpu_pat_enabled = true; 1027 cpu_pat_enabled = true;
1028 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 1028 aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
1029} 1029}
1030 1030
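pat_init() above programs all eight PAT entries with a single 64-bit write to MSR_CR_PAT, each entry presumably occupying one byte of the MSR. The PATENTRY() and PAT_* definitions live in a header that is not part of this diff; as an assumption, they would look roughly like this:

/* Assumed definitions, for illustration only. */
#define PATENTRY(n, type)	((uint64_t)(type) << ((n) * 8))
#define PAT_UC			0x0ULL	/* uncacheable */
#define PAT_WC			0x1ULL	/* write combining */
#define PAT_WT			0x4ULL	/* write through */
#define PAT_WP			0x5ULL	/* write protect */
#define PAT_WB			0x6ULL	/* write back */
#define PAT_UCMINUS		0x7ULL	/* UC-, may be overridden to WC by MTRRs */

With that layout, the table built in pat_init() differs from the power-on default only in entries 1 and 5, which go from write-through to write-combining, matching the "We change WT to WC" comment.
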
1031static pt_entry_t 1031static pt_entry_t
1032pmap_pat_flags(u_int flags) 1032pmap_pat_flags(u_int flags)
1033{ 1033{
1034 u_int cacheflags = (flags & PMAP_CACHE_MASK); 1034 u_int cacheflags = (flags & PMAP_CACHE_MASK);
1035 1035
1036 if (!cpu_pat_enabled) { 1036 if (!cpu_pat_enabled) {
1037 switch (cacheflags) { 1037 switch (cacheflags) {
1038 case PMAP_NOCACHE: 1038 case PMAP_NOCACHE:
1039 case PMAP_NOCACHE_OVR: 1039 case PMAP_NOCACHE_OVR:
1040 /* results in PGC_UCMINUS on CPUs that report PAT in 1040 /* results in PGC_UCMINUS on CPUs that report PAT in
1041 * cpuid but have PAT "disabled" 1041 * cpuid but have PAT "disabled"
1042 */ 1042 */
1043 return PG_N; 1043 return PG_N;
1044 default: 1044 default:
1045 return 0; 1045 return 0;
1046 } 1046 }
1047 } 1047 }
1048 1048
1049 switch (cacheflags) { 1049 switch (cacheflags) {
1050 case PMAP_NOCACHE: 1050 case PMAP_NOCACHE:
1051 return PGC_UC; 1051 return PGC_UC;
1052 case PMAP_WRITE_COMBINE: 1052 case PMAP_WRITE_COMBINE:
1053 return PGC_WC; 1053 return PGC_WC;
1054 case PMAP_WRITE_BACK: 1054 case PMAP_WRITE_BACK:
1055 return PGC_WB; 1055 return PGC_WB;
1056 case PMAP_NOCACHE_OVR: 1056 case PMAP_NOCACHE_OVR:
1057 return PGC_UCMINUS; 1057 return PGC_UCMINUS;
1058 } 1058 }
1059 1059
1060 return 0; 1060 return 0;
1061} 1061}
1062 1062
1063/* 1063/*
1064 * p m a p k e n t e r f u n c t i o n s 1064 * p m a p k e n t e r f u n c t i o n s
1065 * 1065 *
1066 * functions to quickly enter/remove pages from the kernel address 1066 * functions to quickly enter/remove pages from the kernel address
1067 * space. pmap_kremove is exported to MI kernel. we make use of 1067 * space. pmap_kremove is exported to MI kernel. we make use of
1068 * the recursive PTE mappings. 1068 * the recursive PTE mappings.
1069 */ 1069 */
1070 1070
1071/* 1071/*
1072 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1072 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1073 * 1073 *
1074 * => no need to lock anything, assume va is already allocated 1074 * => no need to lock anything, assume va is already allocated
1075 * => should be faster than normal pmap enter function 1075 * => should be faster than normal pmap enter function
1076 */ 1076 */
1077 1077
1078void 1078void
1079pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1079pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1080{ 1080{
1081 pt_entry_t *pte, opte, npte; 1081 pt_entry_t *pte, opte, npte;
1082 1082
1083 KASSERT(!(prot & ~VM_PROT_ALL)); 1083 KASSERT(!(prot & ~VM_PROT_ALL));
1084 1084
1085 if (va < VM_MIN_KERNEL_ADDRESS) 1085 if (va < VM_MIN_KERNEL_ADDRESS)
1086 pte = vtopte(va); 1086 pte = vtopte(va);
1087 else 1087 else
1088 pte = kvtopte(va); 1088 pte = kvtopte(va);
1089#ifdef DOM0OPS 1089#ifdef DOM0OPS
1090 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1090 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1091#ifdef DEBUG 1091#ifdef DEBUG
1092 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1092 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1093 " outside range\n", (int64_t)pa, (int64_t)va); 1093 " outside range\n", (int64_t)pa, (int64_t)va);
1094#endif /* DEBUG */ 1094#endif /* DEBUG */
1095 npte = pa; 1095 npte = pa;
1096 } else 1096 } else
1097#endif /* DOM0OPS */ 1097#endif /* DOM0OPS */
1098 npte = pmap_pa2pte(pa); 1098 npte = pmap_pa2pte(pa);
1099 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1099 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1100 npte |= pmap_pat_flags(flags); 1100 npte |= pmap_pat_flags(flags);
1101 opte = pmap_pte_testset(pte, npte); /* zap! */ 1101 opte = pmap_pte_testset(pte, npte); /* zap! */
1102#if defined(DIAGNOSTIC) 1102#if defined(DIAGNOSTIC)
1103 /* XXX For now... */ 1103 /* XXX For now... */
1104 if (opte & PG_PS) 1104 if (opte & PG_PS)
1105 panic("pmap_kenter_pa: PG_PS"); 1105 panic("pmap_kenter_pa: PG_PS");
1106#endif 1106#endif
1107 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1107 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1108 /* This should not happen, so no need to batch updates. */ 1108 /* This should not happen, so no need to batch updates. */
1109 kpreempt_disable(); 1109 kpreempt_disable();
1110 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1110 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1111 kpreempt_enable(); 1111 kpreempt_enable();
1112 } 1112 }
1113} 1113}
1114 1114
1115void 1115void
1116pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1116pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1117{ 1117{
1118 pt_entry_t *pte, opte, npte; 1118 pt_entry_t *pte, opte, npte;
1119 1119
1120 KASSERT((prot & ~VM_PROT_ALL) == 0); 1120 KASSERT((prot & ~VM_PROT_ALL) == 0);
1121 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1121 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1122 1122
1123#ifdef DOM0OPS 1123#ifdef DOM0OPS
1124 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1124 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1125 npte = pa; 1125 npte = pa;
1126 } else 1126 } else
1127#endif 1127#endif
1128 npte = pmap_pa2pte(pa); 1128 npte = pmap_pa2pte(pa);
1129 1129
1130 1130
1131 npte |= protection_codes[prot] | PG_k | PG_V; 1131 npte |= protection_codes[prot] | PG_k | PG_V;
1132 opte = pmap_pte_testset(pte, npte); 1132 opte = pmap_pte_testset(pte, npte);
1133} 1133}
1134 1134
1135/* 1135/*
1136 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1136 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1137 */ 1137 */
1138void 1138void
1139pmap_emap_sync(bool canload) 1139pmap_emap_sync(bool canload)
1140{ 1140{
1141 struct cpu_info *ci = curcpu(); 1141 struct cpu_info *ci = curcpu();
1142 struct pmap *pmap; 1142 struct pmap *pmap;
1143 1143
1144 KASSERT(kpreempt_disabled()); 1144 KASSERT(kpreempt_disabled());
1145 if (__predict_true(ci->ci_want_pmapload && canload)) { 1145 if (__predict_true(ci->ci_want_pmapload && canload)) {
1146 /* 1146 /*
1147 * XXX: Hint for pmap_reactivate(), which might suggest to 1147 * XXX: Hint for pmap_reactivate(), which might suggest to
1148 * not perform TLB flush, if state has not changed. 1148 * not perform TLB flush, if state has not changed.
1149 */ 1149 */
1150 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1150 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1151 if (__predict_false(pmap == ci->ci_pmap)) { 1151 if (__predict_false(pmap == ci->ci_pmap)) {
1152 const uint32_t cpumask = ci->ci_cpumask; 1152 const uint32_t cpumask = ci->ci_cpumask;
1153 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1153 atomic_and_32(&pmap->pm_cpus, ~cpumask);
1154 } 1154 }
1155 pmap_load(); 1155 pmap_load();
1156 KASSERT(ci->ci_want_pmapload == 0); 1156 KASSERT(ci->ci_want_pmapload == 0);
1157 } else { 1157 } else {
1158 tlbflush(); 1158 tlbflush();
1159 } 1159 }
1160 1160
1161} 1161}
1162 1162
1163void 1163void
1164pmap_emap_remove(vaddr_t sva, vsize_t len) 1164pmap_emap_remove(vaddr_t sva, vsize_t len)
1165{ 1165{
1166 pt_entry_t *pte, xpte = 0; 1166 pt_entry_t *pte, xpte = 0;
1167 vaddr_t va, eva = sva + len; 1167 vaddr_t va, eva = sva + len;
1168 1168
1169 for (va = sva; va < eva; va += PAGE_SIZE) { 1169 for (va = sva; va < eva; va += PAGE_SIZE) {
1170 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1170 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1171 xpte |= pmap_pte_testset(pte, 0); 1171 xpte |= pmap_pte_testset(pte, 0);
1172 } 1172 }
1173} 1173}
1174 1174
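The three pmap_emap_* routines above provide raw, pv-less "ephemeral" kernel mappings whose TLB maintenance is deferred until pmap_emap_sync(). Their real callers live elsewhere in the tree; the following is only a guessed usage pattern, inferred from the kpreempt_disabled() assertion and the comments above rather than quoted from any caller:

/*
 * Hypothetical caller: map a physical page at a scratch VA with
 * preemption disabled, sync once before touching it, then tear the
 * mapping down again before re-enabling preemption.
 */
static void
emap_read_page_sketch(vaddr_t scratch_va, paddr_t src_pa, void *dst)
{

	kpreempt_disable();
	pmap_emap_enter(scratch_va, src_pa, VM_PROT_READ);
	pmap_emap_sync(true);		/* deferred TLB flush or pmap load */
	memcpy(dst, (const void *)scratch_va, PAGE_SIZE);
	pmap_emap_remove(scratch_va, PAGE_SIZE);
	kpreempt_enable();
}
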
1175__weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1175__weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1176 1176
1177#if defined(__x86_64__) 1177#if defined(__x86_64__)
1178/* 1178/*
1179 * Change protection for a virtual address. Local for a CPU only, don't 1179 * Change protection for a virtual address. Local for a CPU only, don't
1180 * care about TLB shootdowns. 1180 * care about TLB shootdowns.
1181 * 1181 *
1182 * => must be called with preemption disabled 1182 * => must be called with preemption disabled
1183 */ 1183 */
1184void 1184void
1185pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1185pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1186{ 1186{
1187 pt_entry_t *pte, opte, npte; 1187 pt_entry_t *pte, opte, npte;
1188 1188
1189 KASSERT(kpreempt_disabled()); 1189 KASSERT(kpreempt_disabled());
1190 1190
1191 if (va < VM_MIN_KERNEL_ADDRESS) 1191 if (va < VM_MIN_KERNEL_ADDRESS)
1192 pte = vtopte(va); 1192 pte = vtopte(va);
1193 else 1193 else
1194 pte = kvtopte(va); 1194 pte = kvtopte(va);
1195 1195
1196 npte = opte = *pte; 1196 npte = opte = *pte;
1197 1197
1198 if ((prot & VM_PROT_WRITE) != 0) 1198 if ((prot & VM_PROT_WRITE) != 0)
1199 npte |= PG_RW; 1199 npte |= PG_RW;
1200 else 1200 else
1201 npte &= ~PG_RW; 1201 npte &= ~PG_RW;
1202 1202
1203 if (opte != npte) { 1203 if (opte != npte) {
1204 pmap_pte_set(pte, npte); 1204 pmap_pte_set(pte, npte);
1205 pmap_pte_flush(); 1205 pmap_pte_flush();
1206 invlpg(va); 1206 invlpg(va);
1207 } 1207 }
1208} 1208}
1209#endif /* defined(__x86_64__) */ 1209#endif /* defined(__x86_64__) */
1210 1210
1211/* 1211/*
1212 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1212 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1213 * 1213 *
1214 * => no need to lock anything 1214 * => no need to lock anything
1215 * => caller must dispose of any vm_page mapped in the va range 1215 * => caller must dispose of any vm_page mapped in the va range
1216 * => note: not an inline function 1216 * => note: not an inline function
1217 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1217 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1218 * => we assume kernel only unmaps valid addresses and thus don't bother 1218 * => we assume kernel only unmaps valid addresses and thus don't bother
1219 * checking the valid bit before doing TLB flushing 1219 * checking the valid bit before doing TLB flushing
1220 * => must be followed by call to pmap_update() before reuse of page 1220 * => must be followed by call to pmap_update() before reuse of page
1221 */ 1221 */
1222 1222
1223void 1223void
1224pmap_kremove(vaddr_t sva, vsize_t len) 1224pmap_kremove(vaddr_t sva, vsize_t len)
1225{ 1225{
1226 pt_entry_t *pte, xpte; 1226 pt_entry_t *pte, xpte;
1227 vaddr_t va, eva; 1227 vaddr_t va, eva;
1228 1228
1229 eva = sva + len; 1229 eva = sva + len;
1230 xpte = 0; 1230 xpte = 0;
1231 1231
1232 for (va = sva; va < eva; va += PAGE_SIZE) { 1232 for (va = sva; va < eva; va += PAGE_SIZE) {
1233 if (va < VM_MIN_KERNEL_ADDRESS) 1233 if (va < VM_MIN_KERNEL_ADDRESS)
1234 pte = vtopte(va); 1234 pte = vtopte(va);
1235 else 1235 else
1236 pte = kvtopte(va); 1236 pte = kvtopte(va);
1237 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1237 xpte |= pmap_pte_testset(pte, 0); /* zap! */
1238#if defined(DIAGNOSTIC) 1238#if defined(DIAGNOSTIC)
1239 /* XXX For now... */ 1239 /* XXX For now... */
1240 if (xpte & PG_PS) 1240 if (xpte & PG_PS)
1241 panic("pmap_kremove: PG_PS"); 1241 panic("pmap_kremove: PG_PS");
1242 if (xpte & PG_PVLIST) 1242 if (xpte & PG_PVLIST)
1243 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1243 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1244 va); 1244 va);
1245#endif 1245#endif
1246 } 1246 }
1247 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1247 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1248 kpreempt_disable(); 1248 kpreempt_disable();
1249 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1249 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1250 kpreempt_enable(); 1250 kpreempt_enable();
1251 } 1251 }
1252} 1252}
1253 1253
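Per the comments on pmap_kenter_pa() and pmap_kremove() above, the usual bracket for a temporary kernel mapping is enter, pmap_update(), use, remove, pmap_update() again before the VA may be reused. A small sketch under those rules; the scratch VA is assumed to have been reserved elsewhere (e.g. with uvm_km_alloc()):

/*
 * Sketch only: zero a physical page through a temporary kernel
 * mapping, with no pv tracking involved.
 */
static void
zero_page_via_kenter(vaddr_t scratch_va, paddr_t pa)
{

	pmap_kenter_pa(scratch_va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	memset((void *)scratch_va, 0, PAGE_SIZE);
	pmap_kremove(scratch_va, PAGE_SIZE);
	pmap_update(pmap_kernel());	/* required before the VA is reused */
}
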
1254/* 1254/*
1255 * p m a p i n i t f u n c t i o n s 1255 * p m a p i n i t f u n c t i o n s
1256 * 1256 *
1257 * pmap_bootstrap and pmap_init are called during system startup 1257 * pmap_bootstrap and pmap_init are called during system startup
1258 * to init the pmap module. pmap_bootstrap() does a low level 1258 * to init the pmap module. pmap_bootstrap() does a low level
1259 * init just to get things rolling. pmap_init() finishes the job. 1259 * init just to get things rolling. pmap_init() finishes the job.
1260 */ 1260 */
1261 1261
1262/* 1262/*
1263 * pmap_bootstrap: get the system in a state where it can run with VM 1263 * pmap_bootstrap: get the system in a state where it can run with VM
1264 * properly enabled (called before main()). the VM system is 1264 * properly enabled (called before main()). the VM system is
1265 * fully init'd later... 1265 * fully init'd later...
1266 * 1266 *
1267 * => on i386, locore.s has already enabled the MMU by allocating 1267 * => on i386, locore.s has already enabled the MMU by allocating
1268 * a PDP for the kernel, and nkpde PTP's for the kernel. 1268 * a PDP for the kernel, and nkpde PTP's for the kernel.
1269 * => kva_start is the first free virtual address in kernel space 1269 * => kva_start is the first free virtual address in kernel space
1270 */ 1270 */
1271 1271
1272void 1272void
1273pmap_bootstrap(vaddr_t kva_start) 1273pmap_bootstrap(vaddr_t kva_start)
1274{ 1274{
1275 struct pmap *kpm; 1275 struct pmap *kpm;
1276 pt_entry_t *pte; 1276 pt_entry_t *pte;
1277 int i; 1277 int i;
1278 vaddr_t kva; 1278 vaddr_t kva;
1279#ifndef XEN 1279#ifndef XEN
1280 unsigned long p1i; 1280 unsigned long p1i;
1281 vaddr_t kva_end; 1281 vaddr_t kva_end;
1282#endif 1282#endif
1283 1283
1284 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1284 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1285 1285
1286 /* 1286 /*
1287 * set up our local static global vars that keep track of the 1287 * set up our local static global vars that keep track of the
1288 * usage of KVM before kernel_map is set up 1288 * usage of KVM before kernel_map is set up
1289 */ 1289 */
1290 1290
1291 virtual_avail = kva_start; /* first free KVA */ 1291 virtual_avail = kva_start; /* first free KVA */
1292 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1292 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
1293 1293
1294 /* 1294 /*
1295 * set up protection_codes: we need to be able to convert from 1295 * set up protection_codes: we need to be able to convert from
1296 * a MI protection code (some combo of VM_PROT...) to something 1296 * a MI protection code (some combo of VM_PROT...) to something
1297 * we can jam into a i386 PTE. 1297 * we can jam into a i386 PTE.
1298 */ 1298 */
1299 1299
1300 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1300 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */
1301 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1301 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */
1302 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1302 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */
1303 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1303 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1304 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1304 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */
1305 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1305 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1306 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1306 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1307 /* wr- */ 1307 /* wr- */
1308 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1308 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */
1309 1309
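With this table in place, building a PTE elsewhere in the file reduces to a lookup plus a few ORs, as pmap_kenter_pa() does earlier in this listing. As an illustration (not quoted code), a writable, non-executable kernel mapping would be assembled roughly as:

/*
 * Illustrative sketch, not part of pmap_bootstrap():
 *
 *	npte = pmap_pa2pte(pa)
 *	     | protection_codes[VM_PROT_READ | VM_PROT_WRITE]
 *	     | PG_k | PG_V;
 *
 * i.e. PG_RW | pg_nx from the table above, plus the valid and
 * kernel bits.
 */
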
1310 /* 1310 /*
1311 * now we init the kernel's pmap 1311 * now we init the kernel's pmap
1312 * 1312 *
1313 * the kernel pmap's pm_obj is not used for much. however, in 1313 * the kernel pmap's pm_obj is not used for much. however, in
1314 * user pmaps the pm_obj contains the list of active PTPs. 1314 * user pmaps the pm_obj contains the list of active PTPs.
1315 * the pm_obj currently does not have a pager. it might be possible 1315 * the pm_obj currently does not have a pager. it might be possible
1316 * to add a pager that would allow a process to read-only mmap its 1316 * to add a pager that would allow a process to read-only mmap its
1317 * own page tables (fast user level vtophys?). this may or may not 1317 * own page tables (fast user level vtophys?). this may or may not
1318 * be useful. 1318 * be useful.
1319 */ 1319 */
1320 1320
1321 kpm = pmap_kernel(); 1321 kpm = pmap_kernel();
1322 for (i = 0; i < PTP_LEVELS - 1; i++) { 1322 for (i = 0; i < PTP_LEVELS - 1; i++) {
1323 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1323 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1324 kpm->pm_ptphint[i] = NULL; 1324 kpm->pm_ptphint[i] = NULL;
1325 } 1325 }
1326 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1326 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
1327 1327
1328 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1328 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1329 for (i = 0; i < PDP_SIZE; i++) 1329 for (i = 0; i < PDP_SIZE; i++)
1330 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1330 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1331 1331
1332 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1332 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1333 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1333 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1334 1334
1335 /* 1335 /*
1336 * the above is just a rough estimate and not critical to the proper 1336 * the above is just a rough estimate and not critical to the proper
1337 * operation of the system. 1337 * operation of the system.
1338 */ 1338 */
1339 1339
1340#ifndef XEN 1340#ifndef XEN
1341 /* 1341 /*
1342 * Begin to enable global TLB entries if they are supported. 1342 * Begin to enable global TLB entries if they are supported.
1343 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1343 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1344 * which happens in cpu_init(), which is run on each cpu 1344 * which happens in cpu_init(), which is run on each cpu
1345 * (and happens later) 1345 * (and happens later)
1346 */ 1346 */
1347 1347
1348 if (cpu_feature[0] & CPUID_PGE) { 1348 if (cpu_feature[0] & CPUID_PGE) {
1349 pmap_pg_g = PG_G; /* enable software */ 1349 pmap_pg_g = PG_G; /* enable software */
1350 1350
1351 /* add PG_G attribute to already mapped kernel pages */ 1351 /* add PG_G attribute to already mapped kernel pages */
1352 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1352 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1353 kva_end = virtual_avail; 1353 kva_end = virtual_avail;
1354 } else { 1354 } else {
1355 extern vaddr_t eblob, esym; 1355 extern vaddr_t eblob, esym;
1356 kva_end = (vaddr_t)&end; 1356 kva_end = (vaddr_t)&end;
1357 if (esym > kva_end) 1357 if (esym > kva_end)
1358 kva_end = esym; 1358 kva_end = esym;
1359 if (eblob > kva_end) 1359 if (eblob > kva_end)
1360 kva_end = eblob; 1360 kva_end = eblob;
1361 kva_end = roundup(kva_end, PAGE_SIZE); 1361 kva_end = roundup(kva_end, PAGE_SIZE);
1362 } 1362 }
1363 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1363 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1364 p1i = pl1_i(kva); 1364 p1i = pl1_i(kva);
1365 if (pmap_valid_entry(PTE_BASE[p1i])) 1365 if (pmap_valid_entry(PTE_BASE[p1i]))
1366 PTE_BASE[p1i] |= PG_G; 1366 PTE_BASE[p1i] |= PG_G;
1367 } 1367 }
1368 } 1368 }
1369 1369
1370 /* 1370 /*
1371 * enable large pages if they are supported. 1371 * enable large pages if they are supported.
1372 */ 1372 */
1373 1373
1374 if (cpu_feature[0] & CPUID_PSE) { 1374 if (cpu_feature[0] & CPUID_PSE) {
1375 paddr_t pa; 1375 paddr_t pa;
1376 pd_entry_t *pde; 1376 pd_entry_t *pde;
1377 extern char __data_start; 1377 extern char __data_start;
1378 1378
1379 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1379 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
1380 pmap_largepages = 1; /* enable software */ 1380 pmap_largepages = 1; /* enable software */
1381 1381
1382 /* 1382 /*
1383 * the TLB must be flushed after enabling large pages 1383 * the TLB must be flushed after enabling large pages
1384 * on Pentium CPUs, according to section 3.6.2.2 of 1384 * on Pentium CPUs, according to section 3.6.2.2 of
1385 * "Intel Architecture Software Developer's Manual, 1385 * "Intel Architecture Software Developer's Manual,
1386 * Volume 3: System Programming". 1386 * Volume 3: System Programming".
1387 */ 1387 */
1388 tlbflush(); 1388 tlbflush();
1389 1389
1390 /* 1390 /*
1391 * now, remap the kernel text using large pages. we 1391 * now, remap the kernel text using large pages. we
1392 * assume that the linker has properly aligned the 1392 * assume that the linker has properly aligned the
1393 * .data segment to a NBPD_L2 boundary. 1393 * .data segment to a NBPD_L2 boundary.
1394 */ 1394 */
1395 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1395 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1396 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1396 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1397 kva += NBPD_L2, pa += NBPD_L2) { 1397 kva += NBPD_L2, pa += NBPD_L2) {
1398 pde = &L2_BASE[pl2_i(kva)]; 1398 pde = &L2_BASE[pl2_i(kva)];
1399 *pde = pa | pmap_pg_g | PG_PS | 1399 *pde = pa | pmap_pg_g | PG_PS |
1400 PG_KR | PG_V; /* zap! */ 1400 PG_KR | PG_V; /* zap! */
1401 tlbflush(); 1401 tlbflush();
1402 } 1402 }
1403#if defined(DEBUG) 1403#if defined(DEBUG)
1404 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1404 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1405 "pages and %" PRIuPSIZE " normal pages\n", 1405 "pages and %" PRIuPSIZE " normal pages\n",
1406 howmany(kva - KERNBASE, NBPD_L2), 1406 howmany(kva - KERNBASE, NBPD_L2),
1407 howmany((vaddr_t)&__data_start - kva, NBPD_L1)); 1407 howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1408#endif /* defined(DEBUG) */ 1408#endif /* defined(DEBUG) */
1409 } 1409 }
1410#endif /* !XEN */ 1410#endif /* !XEN */
1411 1411
1412 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1412 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1413 /* 1413 /*
1414 * zero_pte is stuck at the end of mapped space for the kernel 1414 * zero_pte is stuck at the end of mapped space for the kernel
1415 * image (disjunct from kva space). This is done so that it 1415 * image (disjunct from kva space). This is done so that it
1416 * can safely be used in pmap_growkernel (pmap_get_physpage), 1416 * can safely be used in pmap_growkernel (pmap_get_physpage),
1417 * when it's called for the first time. 1417 * when it's called for the first time.
1418 * XXXfvdl fix this for MULTIPROCESSOR later. 1418 * XXXfvdl fix this for MULTIPROCESSOR later.
1419 */ 1419 */
1420 1420
1421 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1421 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1422 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1422 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1423 } 1423 }
1424 1424
1425 /* 1425 /*
1426 * now we allocate the "special" VAs which are used for tmp mappings 1426 * now we allocate the "special" VAs which are used for tmp mappings
1427 * by the pmap (and other modules). we allocate the VAs by advancing 1427 * by the pmap (and other modules). we allocate the VAs by advancing
1428 * virtual_avail (note that there are no pages mapped at these VAs). 1428 * virtual_avail (note that there are no pages mapped at these VAs).
1429 * we find the PTE that maps the allocated VA via the linear PTE 1429 * we find the PTE that maps the allocated VA via the linear PTE
1430 * mapping. 1430 * mapping.
1431 */ 1431 */
1432 1432
1433 pte = PTE_BASE + pl1_i(virtual_avail); 1433 pte = PTE_BASE + pl1_i(virtual_avail);
1434 1434
1435#ifdef MULTIPROCESSOR 1435#ifdef MULTIPROCESSOR
1436 /* 1436 /*
1437 * Waste some VA space to avoid false sharing of cache lines 1437 * Waste some VA space to avoid false sharing of cache lines
1438 * for page table pages: Give each possible CPU a cache line 1438 * for page table pages: Give each possible CPU a cache line
1439 * of PTE's (8) to play with, though we only need 4. We could 1439 * of PTE's (8) to play with, though we only need 4. We could
1440 * recycle some of this waste by putting the idle stacks here 1440 * recycle some of this waste by putting the idle stacks here
1441 * as well; we could waste less space if we knew the largest 1441 * as well; we could waste less space if we knew the largest
1442 * CPU ID beforehand. 1442 * CPU ID beforehand.
1443 */ 1443 */
1444 csrcp = (char *) virtual_avail; csrc_pte = pte; 1444 csrcp = (char *) virtual_avail; csrc_pte = pte;
1445 1445
1446 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1446 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1;
1447 1447
1448 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1448 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2;
1449 1449
1450 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1450 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3;
1451 1451
1452 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1452 virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1453 pte += maxcpus * NPTECL; 1453 pte += maxcpus * NPTECL;
1454#else 1454#else
1455 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1455 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */
1456 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1456 virtual_avail += PAGE_SIZE; pte++; /* advance */
1457 1457
1458 cdstp = (void *) virtual_avail; cdst_pte = pte; 1458 cdstp = (void *) virtual_avail; cdst_pte = pte;
1459 virtual_avail += PAGE_SIZE; pte++; 1459 virtual_avail += PAGE_SIZE; pte++;
1460 1460
1461 zerop = (void *) virtual_avail; zero_pte = pte; 1461 zerop = (void *) virtual_avail; zero_pte = pte;
1462 virtual_avail += PAGE_SIZE; pte++; 1462 virtual_avail += PAGE_SIZE; pte++;
1463 1463
1464 ptpp = (void *) virtual_avail; ptp_pte = pte; 1464 ptpp = (void *) virtual_avail; ptp_pte = pte;
1465 virtual_avail += PAGE_SIZE; pte++; 1465 virtual_avail += PAGE_SIZE; pte++;
1466#endif 1466#endif
1467 1467
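The MULTIPROCESSOR branch above only records base addresses; each CPU presumably reaches its own cache line of PTEs (and the matching VA window) by adding a fixed slew of NPTECL pages per CPU index, consistent with the "maxcpus * NPTECL" advance. A sketch of that addressing, with helper names that are not taken from this file:

/* Sketch only: per-CPU slots for the copy-source window. */
static inline pt_entry_t *
cpu_csrc_pte(u_int cid)
{

	return csrc_pte + cid * NPTECL;
}

static inline char *
cpu_csrcp(u_int cid)
{

	return csrcp + cid * NPTECL * PAGE_SIZE;
}
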
1468 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1468 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1469 early_zerop = zerop; 1469 early_zerop = zerop;
1470 early_zero_pte = zero_pte; 1470 early_zero_pte = zero_pte;
1471 } 1471 }
1472 1472
1473 /* 1473 /*
1474 * Nothing after this point actually needs pte; 1474 * Nothing after this point actually needs pte;
1475 */ 1475 */
1476 pte = (void *)0xdeadbeef; 1476 pte = (void *)0xdeadbeef;
1477 1477
1478 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1478 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1479 /* XXXfvdl PTEs not needed here */ 1479 /* XXXfvdl PTEs not needed here */
1480 vmmap = (char *)virtual_avail; /* don't need pte */ 1480 vmmap = (char *)virtual_avail; /* don't need pte */
1481 virtual_avail += PAGE_SIZE; pte++; 1481 virtual_avail += PAGE_SIZE; pte++;
1482 1482
1483#ifdef XEN 1483#ifdef XEN
1484#ifdef __x86_64__ 1484#ifdef __x86_64__
1485 /* 1485 /*
1486 * We want a dummy page directory for Xen: 1486 * We want a dummy page directory for Xen:
1487 * when we deactivate a pmap, Xen still considers it active. 1487 * when we deactivate a pmap, Xen still considers it active.
1488 * So we point the user PGD at this dummy page, lifting all 1488 * So we point the user PGD at this dummy page, lifting all
1489 * protection from the now-inactive page table set. 1489 * protection from the now-inactive page table set.
1490 */ 1490 */
1491 xen_dummy_user_pgd = avail_start; 1491 xen_dummy_user_pgd = avail_start;
1492 avail_start += PAGE_SIZE; 1492 avail_start += PAGE_SIZE;
1493  1493
1494 /* Zero-fill it; the fewer checks Xen has to make on it, the better */ 1494 /* Zero-fill it; the fewer checks Xen has to make on it, the better */
1495 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1495 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1496 /* Mark read-only */ 1496 /* Mark read-only */
1497 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1497 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1498 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1498 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1499 /* Pin as L4 */ 1499 /* Pin as L4 */
1500 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1500 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1501#endif /* __x86_64__ */ 1501#endif /* __x86_64__ */
1502 idt_vaddr = virtual_avail; /* don't need pte */ 1502 idt_vaddr = virtual_avail; /* don't need pte */
1503 idt_paddr = avail_start; /* steal a page */ 1503 idt_paddr = avail_start; /* steal a page */
1504 /* 1504 /*
1505 * Xen requires one more page, as we cannot store the 1505 * Xen requires one more page, as we cannot store the
1506 * GDT and LDT on the same page 1506 * GDT and LDT on the same page
1507 */ 1507 */
1508 virtual_avail += 3 * PAGE_SIZE; 1508 virtual_avail += 3 * PAGE_SIZE;
1509 avail_start += 3 * PAGE_SIZE; 1509 avail_start += 3 * PAGE_SIZE;
1510#else /* XEN */ 1510#else /* XEN */
1511 idt_vaddr = virtual_avail; /* don't need pte */ 1511 idt_vaddr = virtual_avail; /* don't need pte */
1512 idt_paddr = avail_start; /* steal a page */ 1512 idt_paddr = avail_start; /* steal a page */
1513#if defined(__x86_64__) 1513#if defined(__x86_64__)
1514 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1514 virtual_avail += 2 * PAGE_SIZE; pte += 2;
1515 avail_start += 2 * PAGE_SIZE; 1515 avail_start += 2 * PAGE_SIZE;
1516#else /* defined(__x86_64__) */ 1516#else /* defined(__x86_64__) */
1517 virtual_avail += PAGE_SIZE; pte++; 1517 virtual_avail += PAGE_SIZE; pte++;
1518 avail_start += PAGE_SIZE; 1518 avail_start += PAGE_SIZE;
1519 /* pentium f00f bug stuff */ 1519 /* pentium f00f bug stuff */
1520 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1520 pentium_idt_vaddr = virtual_avail; /* don't need pte */
1521 virtual_avail += PAGE_SIZE; pte++; 1521 virtual_avail += PAGE_SIZE; pte++;
1522#endif /* defined(__x86_64__) */ 1522#endif /* defined(__x86_64__) */
1523#endif /* XEN */ 1523#endif /* XEN */
1524 1524
1525#ifdef _LP64 1525#ifdef _LP64
1526 /* 1526 /*
1527 * Grab a page below 4G for things that need it (i.e. 1527 * Grab a page below 4G for things that need it (i.e.
1528 * having an initial %cr3 for the MP trampoline). 1528 * having an initial %cr3 for the MP trampoline).
1529 */ 1529 */
1530 lo32_vaddr = virtual_avail; 1530 lo32_vaddr = virtual_avail;
1531 virtual_avail += PAGE_SIZE; pte++; 1531 virtual_avail += PAGE_SIZE; pte++;
1532 lo32_paddr = avail_start; 1532 lo32_paddr = avail_start;
1533 avail_start += PAGE_SIZE; 1533 avail_start += PAGE_SIZE;
1534#endif 1534#endif
1535 1535
1536 /* 1536 /*
1537 * now we reserve some VM for mapping pages when doing a crash dump 1537 * now we reserve some VM for mapping pages when doing a crash dump
1538 */ 1538 */
1539 1539
1540 virtual_avail = reserve_dumppages(virtual_avail); 1540 virtual_avail = reserve_dumppages(virtual_avail);
1541 1541
1542 /* 1542 /*
1543 * init the static-global locks and global lists. 1543 * init the static-global locks and global lists.
1544 * 1544 *
1545 * => pventry::pvh_lock (initialized elsewhere) must also be 1545 * => pventry::pvh_lock (initialized elsewhere) must also be
1546 * a spin lock, again at IPL_VM to prevent deadlock, and 1546 * a spin lock, again at IPL_VM to prevent deadlock, and
1547 * again is never taken from interrupt context. 1547 * again is never taken from interrupt context.
1548 */ 1548 */
1549 1549
1550 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1550 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1551 LIST_INIT(&pmaps); 1551 LIST_INIT(&pmaps);
1552 pmap_cpu_init_early(curcpu()); 1552 pmap_cpu_init_early(curcpu());
1553 1553
1554 /* 1554 /*
1555 * initialize caches. 1555 * initialize caches.
1556 */ 1556 */
1557 1557
1558 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1558 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1559 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1559 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1560#ifdef PAE 1560#ifdef PAE
1561 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1561 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1562 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1562 "pdppl", &pmap_pdp_allocator, IPL_NONE,
1563 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1563 pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1564#else /* PAE */ 1564#else /* PAE */
1565 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1565 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1566 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1566 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1567#endif /* PAE */ 1567#endif /* PAE */
1568 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1568 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1569 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1569 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1570 NULL, NULL); 1570 NULL, NULL);
1571 1571
1572 /* 1572 /*
1573 * ensure the TLB is sync'd with reality by flushing it... 1573 * ensure the TLB is sync'd with reality by flushing it...
1574 */ 1574 */
1575 1575
1576 tlbflush(); 1576 tlbflush();
1577 1577
1578 /* 1578 /*
1579 * calculate pmap_maxkvaddr from nkptp[]. 1579 * calculate pmap_maxkvaddr from nkptp[].
1580 */ 1580 */
1581 1581
1582 kva = VM_MIN_KERNEL_ADDRESS; 1582 kva = VM_MIN_KERNEL_ADDRESS;
1583 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1583 for (i = PTP_LEVELS - 1; i >= 1; i--) {
1584 kva += nkptp[i] * nbpd[i]; 1584 kva += nkptp[i] * nbpd[i];
1585 } 1585 }
1586 pmap_maxkvaddr = kva; 1586 pmap_maxkvaddr = kva;
1587} 1587}
1588 1588
1589#if defined(__x86_64__) 1589#if defined(__x86_64__)
1590/* 1590/*
1591 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1591 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1592 * trampoline code can be entered. 1592 * trampoline code can be entered.
1593 */ 1593 */
1594void 1594void
1595pmap_prealloc_lowmem_ptps(void) 1595pmap_prealloc_lowmem_ptps(void)
1596{ 1596{
1597#ifdef XEN 1597#ifdef XEN
1598 int level; 1598 int level;
1599 paddr_t newp; 1599 paddr_t newp;
1600 paddr_t pdes_pa; 1600 paddr_t pdes_pa;
1601 1601
1602 pdes_pa = pmap_pdirpa(pmap_kernel(), 0); 1602 pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1603 level = PTP_LEVELS; 1603 level = PTP_LEVELS;
1604 for (;;) { 1604 for (;;) {
1605 newp = avail_start; 1605 newp = avail_start;
1606 avail_start += PAGE_SIZE; 1606 avail_start += PAGE_SIZE;
1607 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1607 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1608 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1608 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1609 memset((void *)early_zerop, 0, PAGE_SIZE); 1609 memset((void *)early_zerop, 0, PAGE_SIZE);
1610 /* Mark R/O before installing */ 1610 /* Mark R/O before installing */
1611 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1611 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1612 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1612 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1613 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1613 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1614 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1614 HYPERVISOR_update_va_mapping (newp + KERNBASE,
1615 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1615 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1616 xpq_queue_pte_update ( 1616 xpq_queue_pte_update (
1617 xpmap_ptom_masked(pdes_pa) 1617 xpmap_ptom_masked(pdes_pa)
1618 + (pl_i(0, level) * sizeof (pd_entry_t)), 1618 + (pl_i(0, level) * sizeof (pd_entry_t)),
1619 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1619 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1620 level--; 1620 level--;
1621 if (level <= 1) 1621 if (level <= 1)
1622 break; 1622 break;
1623 pdes_pa = newp; 1623 pdes_pa = newp;
1624 } 1624 }
1625#else /* XEN */ 1625#else /* XEN */
1626 pd_entry_t *pdes; 1626 pd_entry_t *pdes;
1627 int level; 1627 int level;
1628 paddr_t newp; 1628 paddr_t newp;
1629 1629
1630 pdes = pmap_kernel()->pm_pdir; 1630 pdes = pmap_kernel()->pm_pdir;
1631 level = PTP_LEVELS; 1631 level = PTP_LEVELS;
1632 for (;;) { 1632 for (;;) {
1633 newp = avail_start; 1633 newp = avail_start;
1634 avail_start += PAGE_SIZE; 1634 avail_start += PAGE_SIZE;
1635 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1635 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1636 pmap_update_pg((vaddr_t)early_zerop); 1636 pmap_update_pg((vaddr_t)early_zerop);
1637 memset(early_zerop, 0, PAGE_SIZE); 1637 memset(early_zerop, 0, PAGE_SIZE);
1638 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1638 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1639 level--; 1639 level--;
1640 if (level <= 1) 1640 if (level <= 1)
1641 break; 1641 break;
1642 pdes = normal_pdes[level - 2]; 1642 pdes = normal_pdes[level - 2];
1643 } 1643 }
1644#endif /* XEN */ 1644#endif /* XEN */
1645} 1645}
1646#endif /* defined(__x86_64__) */ 1646#endif /* defined(__x86_64__) */
1647 1647
1648/* 1648/*
1649 * pmap_init: called from uvm_init, our job is to get the pmap 1649 * pmap_init: called from uvm_init, our job is to get the pmap
1650 * system ready to manage mappings... 1650 * system ready to manage mappings...
1651 */ 1651 */
1652 1652
1653void 1653void
1654pmap_init(void) 1654pmap_init(void)
1655{ 1655{
1656 int i; 1656 int i;
1657 1657
1658 for (i = 0; i < PV_HASH_SIZE; i++) { 1658 for (i = 0; i < PV_HASH_SIZE; i++) {
1659 SLIST_INIT(&pv_hash_heads[i].hh_list); 1659 SLIST_INIT(&pv_hash_heads[i].hh_list);
1660 } 1660 }
1661 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1661 for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1662 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1662 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1663 } 1663 }
1664 1664
1665 /* 1665 /*
1666 * done: pmap module is up (and ready for business) 1666 * done: pmap module is up (and ready for business)
1667 */ 1667 */
1668 1668
1669 pmap_initialized = true; 1669 pmap_initialized = true;
1670} 1670}
1671 1671
1672/* 1672/*
1673 * pmap_cpu_init_early: perform early per-CPU initialization. 1673 * pmap_cpu_init_early: perform early per-CPU initialization.
1674 */ 1674 */
1675 1675
1676void 1676void
1677pmap_cpu_init_early(struct cpu_info *ci) 1677pmap_cpu_init_early(struct cpu_info *ci)
1678{ 1678{
1679 struct pmap_cpu *pc; 1679 struct pmap_cpu *pc;
1680 static uint8_t pmap_cpu_alloc; 1680 static uint8_t pmap_cpu_alloc;
1681 1681
1682 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1682 pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1683 ci->ci_pmap_cpu = pc; 1683 ci->ci_pmap_cpu = pc;
1684} 1684}
1685 1685
1686/* 1686/*
1687 * pmap_cpu_init_late: perform late per-CPU initialization. 1687 * pmap_cpu_init_late: perform late per-CPU initialization.
1688 */ 1688 */
1689 1689
1690void 1690void
1691pmap_cpu_init_late(struct cpu_info *ci) 1691pmap_cpu_init_late(struct cpu_info *ci)
1692{ 1692{
1693 1693
1694 if (ci == &cpu_info_primary) { 1694 if (ci == &cpu_info_primary) {
1695 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1695 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1696 NULL, "global", "TLB IPI"); 1696 NULL, "global", "TLB IPI");
1697 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1697 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1698 NULL, "x86", "io bitmap copy"); 1698 NULL, "x86", "io bitmap copy");
1699 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1699 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1700 NULL, "x86", "ldt sync"); 1700 NULL, "x86", "ldt sync");
1701 } 1701 }
1702 1702
1703 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1703 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1704 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1704 NULL, device_xname(ci->ci_dev), "TLB IPI");
1705 1705
1706#ifdef PAE 1706#ifdef PAE
1707 int ret; 1707 int ret;
1708 struct pglist pg; 1708 struct pglist pg;
1709 struct vm_page *vmap; 1709 struct vm_page *vmap;
1710 1710
1711 /* The BP has already its own L3 page allocated in locore.S. */ 1711 /* The BP has already its own L3 page allocated in locore.S. */
1712 if (ci == &cpu_info_primary) 1712 if (ci == &cpu_info_primary)
1713 return; 1713 return;
1714 1714
1715 /* 1715 /*
1716 * Allocate a page for the per-CPU L3 PD. As %cr3 is only 32 bits 1716 * Allocate a page for the per-CPU L3 PD. As %cr3 is only 32 bits
1717 * wide, the PA must reside below the 4GB boundary. 1717 * wide, the PA must reside below the 4GB boundary.
1718 */ 1718 */
1719 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); 1719 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1720 vmap = TAILQ_FIRST(&pg); 1720 vmap = TAILQ_FIRST(&pg);
1721 1721
1722 if (ret != 0 || vmap == NULL) 1722 if (ret != 0 || vmap == NULL)
1723 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", 1723 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1724 __func__, cpu_index(ci), ret); 1724 __func__, cpu_index(ci), ret);
1725 1725
1726 ci->ci_pae_l3_pdirpa = vmap->phys_addr; 1726 ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1727 1727
1728 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 1728 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1729 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 1729 UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1730 if (ci->ci_pae_l3_pdir == NULL) 1730 if (ci->ci_pae_l3_pdir == NULL)
1731 panic("%s: failed to allocate L3 PD for CPU %d\n", 1731 panic("%s: failed to allocate L3 PD for CPU %d\n",
1732 __func__, cpu_index(ci)); 1732 __func__, cpu_index(ci));
1733 1733
1734 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, 1734 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1735 VM_PROT_READ | VM_PROT_WRITE, 0); 1735 VM_PROT_READ | VM_PROT_WRITE, 0);
1736 1736
1737 pmap_update(pmap_kernel()); 1737 pmap_update(pmap_kernel());
1738#endif 1738#endif
1739} 1739}
1740 1740
1741/* 1741/*
1742 * p v _ e n t r y f u n c t i o n s 1742 * p v _ e n t r y f u n c t i o n s
1743 */ 1743 */
1744 1744
1745/* 1745/*
1746 * pmap_free_pvs: free a list of pv_entrys 1746 * pmap_free_pvs: free a list of pv_entrys
1747 */ 1747 */
1748 1748
1749static void 1749static void
1750pmap_free_pvs(struct pv_entry *pve) 1750pmap_free_pvs(struct pv_entry *pve)
1751{ 1751{
1752 struct pv_entry *next; 1752 struct pv_entry *next;
1753 1753
1754 for ( /* null */ ; pve != NULL ; pve = next) { 1754 for ( /* null */ ; pve != NULL ; pve = next) {
1755 next = pve->pve_next; 1755 next = pve->pve_next;
1756 pool_cache_put(&pmap_pv_cache, pve); 1756 pool_cache_put(&pmap_pv_cache, pve);
1757 } 1757 }
1758} 1758}
1759 1759
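pmap_free_pvs() above returns a whole chain of pv_entrys to pmap_pv_cache by walking the pve_next links. The matching allocation side is not in this hunk; presumably it is the mirror image, along the lines of this sketch (the helper name is illustrative only):

/*
 * Sketch only: take one pv_entry from the pool cache and push it
 * onto a caller-held list via the same pve_next link.
 */
static struct pv_entry *
pmap_alloc_pv_sketch(struct pv_entry *head)
{
	struct pv_entry *pve;

	pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
	if (pve == NULL)
		return NULL;
	pve->pve_next = head;
	return pve;
}
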
1760/* 1760/*
1761 * main pv_entry manipulation functions: 1761 * main pv_entry manipulation functions:
1762 * pmap_enter_pv: enter a mapping onto a pv_head list 1762 * pmap_enter_pv: enter a mapping onto a pv_head list
1763 * pmap_remove_pv: remove a mapping from a pv_head list 1763 * pmap_remove_pv: remove a mapping from a pv_head list
1764 * 1764 *
1765 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock  1765 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1766 * the pvh before calling 1766 * the pvh before calling
1767 */ 1767 */
1768 1768
1769/* 1769/*