Wed Sep 24 09:33:41 2008 UTC
PR kern/39307: mfs will sometimes panic at umount time

In vfs_destroy(), assert that the refcount does not drop below zero.


(ad)
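The cast matters because atomic_dec_uint_nv() returns an unsigned value: if an
extra reference drop takes mnt_refcnt below zero, the counter wraps to a huge
positive number and the old "> 0" test quietly succeeds.  Casting the result to
int makes the wrapped value negative, so the destroy path is taken and the new
KASSERT(mp->mnt_refcnt == 0) fires instead of the underflow going unnoticed.
The following is a minimal user-space sketch of that behaviour; dec_uint_nv()
is a hypothetical single-threaded stand-in for the kernel's
atomic_dec_uint_nv(), not the real primitive.

	#include <assert.h>
	#include <stdio.h>

	static unsigned int refcnt = 1;

	/* Hypothetical stand-in for atomic_dec_uint_nv(); no atomics here. */
	static unsigned int
	dec_uint_nv(unsigned int *p)
	{
		return --*p;
	}

	int
	main(void)
	{
		/* Last legitimate reference: 1 -> 0, take the destroy path. */
		if ((int)dec_uint_nv(&refcnt) > 0)
			return 0;
		assert(refcnt == 0);	/* mirrors the new KASSERT() */
		printf("last reference dropped\n");

		/* An erroneous extra drop wraps the unsigned counter ... */
		unsigned int wrapped = dec_uint_nv(&refcnt);
		printf("unsigned: %u  signed: %d\n", wrapped, (int)wrapped);
		/*
		 * ... so "wrapped > 0" is still true, while "(int)wrapped > 0"
		 * is false (-1 on two's-complement targets), which is what
		 * lets the assertion catch the underflow.
		 */
		return 0;
	}
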
cvs diff -r1.356 -r1.357 src/sys/kern/vfs_subr.c

--- src/sys/kern/vfs_subr.c 2008/09/07 13:09:36 1.356
+++ src/sys/kern/vfs_subr.c 2008/09/24 09:33:40 1.357
@@ -1,1291 +1,1292 @@
1/* $NetBSD: vfs_subr.c,v 1.356 2008/09/07 13:09:36 tron Exp $ */ 1/* $NetBSD: vfs_subr.c,v 1.357 2008/09/24 09:33:40 ad Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer. 15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright 16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the 17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution. 18 * documentation and/or other materials provided with the distribution.
19 * 19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE. 30 * POSSIBILITY OF SUCH DAMAGE.
31 */ 31 */
32 32
33/* 33/*
34 * Copyright (c) 1989, 1993 34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved. 35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc. 36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed 37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph 38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc. 40 * the permission of UNIX System Laboratories, Inc.
41 * 41 *
42 * Redistribution and use in source and binary forms, with or without 42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions 43 * modification, are permitted provided that the following conditions
44 * are met: 44 * are met:
45 * 1. Redistributions of source code must retain the above copyright 45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer. 46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright 47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the 48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution. 49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors 50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software 51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission. 52 * without specific prior written permission.
53 * 53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE. 64 * SUCH DAMAGE.
65 * 65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */ 67 */
68 68
69/* 69/*
70 * Note on v_usecount and locking: 70 * Note on v_usecount and locking:
71 * 71 *
72 * At nearly all points it is known that v_usecount could be zero, the 72 * At nearly all points it is known that v_usecount could be zero, the
73 * vnode interlock will be held. 73 * vnode interlock will be held.
74 * 74 *
75 * To change v_usecount away from zero, the interlock must be held. To 75 * To change v_usecount away from zero, the interlock must be held. To
76 * change from a non-zero value to zero, again the interlock must be 76 * change from a non-zero value to zero, again the interlock must be
77 * held. 77 * held.
78 * 78 *
79 * Changing the usecount from a non-zero value to a non-zero value can 79 * Changing the usecount from a non-zero value to a non-zero value can
80 * safely be done using atomic operations, without the interlock held. 80 * safely be done using atomic operations, without the interlock held.
81 */ 81 */
82 82
83#include <sys/cdefs.h> 83#include <sys/cdefs.h>
84__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.356 2008/09/07 13:09:36 tron Exp $"); 84__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.357 2008/09/24 09:33:40 ad Exp $");
85 85
86#include "opt_ddb.h" 86#include "opt_ddb.h"
87#include "opt_compat_netbsd.h" 87#include "opt_compat_netbsd.h"
88#include "opt_compat_43.h" 88#include "opt_compat_43.h"
89 89
90#include <sys/param.h> 90#include <sys/param.h>
91#include <sys/systm.h> 91#include <sys/systm.h>
92#include <sys/proc.h> 92#include <sys/proc.h>
93#include <sys/kernel.h> 93#include <sys/kernel.h>
94#include <sys/mount.h> 94#include <sys/mount.h>
95#include <sys/fcntl.h> 95#include <sys/fcntl.h>
96#include <sys/vnode.h> 96#include <sys/vnode.h>
97#include <sys/stat.h> 97#include <sys/stat.h>
98#include <sys/namei.h> 98#include <sys/namei.h>
99#include <sys/ucred.h> 99#include <sys/ucred.h>
100#include <sys/buf.h> 100#include <sys/buf.h>
101#include <sys/errno.h> 101#include <sys/errno.h>
102#include <sys/malloc.h> 102#include <sys/malloc.h>
103#include <sys/syscallargs.h> 103#include <sys/syscallargs.h>
104#include <sys/device.h> 104#include <sys/device.h>
105#include <sys/filedesc.h> 105#include <sys/filedesc.h>
106#include <sys/kauth.h> 106#include <sys/kauth.h>
107#include <sys/atomic.h> 107#include <sys/atomic.h>
108#include <sys/kthread.h> 108#include <sys/kthread.h>
109#include <sys/wapbl.h> 109#include <sys/wapbl.h>
110 110
111#include <miscfs/specfs/specdev.h> 111#include <miscfs/specfs/specdev.h>
112#include <miscfs/syncfs/syncfs.h> 112#include <miscfs/syncfs/syncfs.h>
113 113
114#include <uvm/uvm.h> 114#include <uvm/uvm.h>
115#include <uvm/uvm_readahead.h> 115#include <uvm/uvm_readahead.h>
116#include <uvm/uvm_ddb.h> 116#include <uvm/uvm_ddb.h>
117 117
118#include <sys/sysctl.h> 118#include <sys/sysctl.h>
119 119
120const enum vtype iftovt_tab[16] = { 120const enum vtype iftovt_tab[16] = {
121 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 121 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
122 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 122 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
123}; 123};
124const int vttoif_tab[9] = { 124const int vttoif_tab[9] = {
125 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 125 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
126 S_IFSOCK, S_IFIFO, S_IFMT, 126 S_IFSOCK, S_IFIFO, S_IFMT,
127}; 127};
128 128
129/* 129/*
130 * Insq/Remq for the vnode usage lists. 130 * Insq/Remq for the vnode usage lists.
131 */ 131 */
132#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 132#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
133#define bufremvn(bp) { \ 133#define bufremvn(bp) { \
134 LIST_REMOVE(bp, b_vnbufs); \ 134 LIST_REMOVE(bp, b_vnbufs); \
135 (bp)->b_vnbufs.le_next = NOLIST; \ 135 (bp)->b_vnbufs.le_next = NOLIST; \
136} 136}
137 137
138int doforce = 1; /* 1 => permit forcible unmounting */ 138int doforce = 1; /* 1 => permit forcible unmounting */
139int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 139int prtactive = 0; /* 1 => print out reclaim of active vnodes */
140 140
141extern int dovfsusermount; /* 1 => permit any user to mount filesystems */ 141extern int dovfsusermount; /* 1 => permit any user to mount filesystems */
142extern int vfs_magiclinks; /* 1 => expand "magic" symlinks */ 142extern int vfs_magiclinks; /* 1 => expand "magic" symlinks */
143 143
144static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 144static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
145static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 145static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
146static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 146static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
147 147
148struct mntlist mountlist = /* mounted filesystem list */ 148struct mntlist mountlist = /* mounted filesystem list */
149 CIRCLEQ_HEAD_INITIALIZER(mountlist); 149 CIRCLEQ_HEAD_INITIALIZER(mountlist);
150 150
151u_int numvnodes; 151u_int numvnodes;
152static specificdata_domain_t mount_specificdata_domain; 152static specificdata_domain_t mount_specificdata_domain;
153 153
154static int vrele_pending; 154static int vrele_pending;
155static int vrele_gen; 155static int vrele_gen;
156static kmutex_t vrele_lock; 156static kmutex_t vrele_lock;
157static kcondvar_t vrele_cv; 157static kcondvar_t vrele_cv;
158static lwp_t *vrele_lwp; 158static lwp_t *vrele_lwp;
159 159
160kmutex_t mountlist_lock; 160kmutex_t mountlist_lock;
161kmutex_t mntid_lock; 161kmutex_t mntid_lock;
162kmutex_t mntvnode_lock; 162kmutex_t mntvnode_lock;
163kmutex_t vnode_free_list_lock; 163kmutex_t vnode_free_list_lock;
164kmutex_t vfs_list_lock; 164kmutex_t vfs_list_lock;
165 165
166static pool_cache_t vnode_cache; 166static pool_cache_t vnode_cache;
167 167
168MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); 168MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
169 169
170/* 170/*
171 * These define the root filesystem and device. 171 * These define the root filesystem and device.
172 */ 172 */
173struct vnode *rootvnode; 173struct vnode *rootvnode;
174struct device *root_device; /* root device */ 174struct device *root_device; /* root device */
175 175
176/* 176/*
177 * Local declarations. 177 * Local declarations.
178 */ 178 */
179 179
180static void vrele_thread(void *); 180static void vrele_thread(void *);
181static void insmntque(vnode_t *, struct mount *); 181static void insmntque(vnode_t *, struct mount *);
182static int getdevvp(dev_t, vnode_t **, enum vtype); 182static int getdevvp(dev_t, vnode_t **, enum vtype);
183static vnode_t *getcleanvnode(void);; 183static vnode_t *getcleanvnode(void);;
184void vpanic(vnode_t *, const char *); 184void vpanic(vnode_t *, const char *);
185 185
186#ifdef DEBUG  186#ifdef DEBUG
187void printlockedvnodes(void); 187void printlockedvnodes(void);
188#endif 188#endif
189 189
190#ifdef DIAGNOSTIC 190#ifdef DIAGNOSTIC
191void 191void
192vpanic(vnode_t *vp, const char *msg) 192vpanic(vnode_t *vp, const char *msg)
193{ 193{
194 194
195 vprint(NULL, vp); 195 vprint(NULL, vp);
196 panic("%s\n", msg); 196 panic("%s\n", msg);
197} 197}
198#else 198#else
199#define vpanic(vp, msg) /* nothing */ 199#define vpanic(vp, msg) /* nothing */
200#endif 200#endif
201 201
202void 202void
203vn_init1(void) 203vn_init1(void)
204{ 204{
205 205
206 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 206 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
207 NULL, IPL_NONE, NULL, NULL, NULL); 207 NULL, IPL_NONE, NULL, NULL, NULL);
208 KASSERT(vnode_cache != NULL); 208 KASSERT(vnode_cache != NULL);
209 209
210 /* Create deferred release thread. */ 210 /* Create deferred release thread. */
211 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 211 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
212 cv_init(&vrele_cv, "vrele"); 212 cv_init(&vrele_cv, "vrele");
213 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 213 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
214 NULL, &vrele_lwp, "vrele")) 214 NULL, &vrele_lwp, "vrele"))
215 panic("fork vrele"); 215 panic("fork vrele");
216} 216}
217 217
218/* 218/*
219 * Initialize the vnode management data structures. 219 * Initialize the vnode management data structures.
220 */ 220 */
221void 221void
222vntblinit(void) 222vntblinit(void)
223{ 223{
224 224
225 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); 225 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
226 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); 226 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
227 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE); 227 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
228 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 228 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
229 mutex_init(&specfs_lock, MUTEX_DEFAULT, IPL_NONE); 229 mutex_init(&specfs_lock, MUTEX_DEFAULT, IPL_NONE);
230 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); 230 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
231 231
232 mount_specificdata_domain = specificdata_domain_create(); 232 mount_specificdata_domain = specificdata_domain_create();
233 233
234 /* Initialize the filesystem syncer. */ 234 /* Initialize the filesystem syncer. */
235 vn_initialize_syncerd(); 235 vn_initialize_syncerd();
236 vn_init1(); 236 vn_init1();
237} 237}
238 238
239int 239int
240vfs_drainvnodes(long target, struct lwp *l) 240vfs_drainvnodes(long target, struct lwp *l)
241{ 241{
242 242
243 while (numvnodes > target) { 243 while (numvnodes > target) {
244 vnode_t *vp; 244 vnode_t *vp;
245 245
246 mutex_enter(&vnode_free_list_lock); 246 mutex_enter(&vnode_free_list_lock);
247 vp = getcleanvnode(); 247 vp = getcleanvnode();
248 if (vp == NULL) 248 if (vp == NULL)
249 return EBUSY; /* give up */ 249 return EBUSY; /* give up */
250 ungetnewvnode(vp); 250 ungetnewvnode(vp);
251 } 251 }
252 252
253 return 0; 253 return 0;
254} 254}
255 255
256/* 256/*
257 * Lookup a mount point by filesystem identifier. 257 * Lookup a mount point by filesystem identifier.
258 * 258 *
259 * XXX Needs to add a reference to the mount point. 259 * XXX Needs to add a reference to the mount point.
260 */ 260 */
261struct mount * 261struct mount *
262vfs_getvfs(fsid_t *fsid) 262vfs_getvfs(fsid_t *fsid)
263{ 263{
264 struct mount *mp; 264 struct mount *mp;
265 265
266 mutex_enter(&mountlist_lock); 266 mutex_enter(&mountlist_lock);
267 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 267 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
268 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 268 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
269 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 269 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
270 mutex_exit(&mountlist_lock); 270 mutex_exit(&mountlist_lock);
271 return (mp); 271 return (mp);
272 } 272 }
273 } 273 }
274 mutex_exit(&mountlist_lock); 274 mutex_exit(&mountlist_lock);
275 return ((struct mount *)0); 275 return ((struct mount *)0);
276} 276}
277 277
278/* 278/*
279 * Drop a reference to a mount structure, freeing if the last reference. 279 * Drop a reference to a mount structure, freeing if the last reference.
280 */ 280 */
281void 281void
282vfs_destroy(struct mount *mp) 282vfs_destroy(struct mount *mp)
283{ 283{
284 284
285 if (__predict_true(atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { 285 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
286 return; 286 return;
287 } 287 }
288 288
289 /* 289 /*
290 * Nothing else has visibility of the mount: we can now 290 * Nothing else has visibility of the mount: we can now
291 * free the data structures. 291 * free the data structures.
292 */ 292 */
 293 KASSERT(mp->mnt_refcnt == 0);
293 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 294 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
294 rw_destroy(&mp->mnt_unmounting); 295 rw_destroy(&mp->mnt_unmounting);
295 mutex_destroy(&mp->mnt_updating); 296 mutex_destroy(&mp->mnt_updating);
296 mutex_destroy(&mp->mnt_renamelock); 297 mutex_destroy(&mp->mnt_renamelock);
297 if (mp->mnt_op != NULL) { 298 if (mp->mnt_op != NULL) {
298 vfs_delref(mp->mnt_op); 299 vfs_delref(mp->mnt_op);
299 } 300 }
300 kmem_free(mp, sizeof(*mp)); 301 kmem_free(mp, sizeof(*mp));
301} 302}
302 303
303/* 304/*
304 * grab a vnode from freelist and clean it. 305 * grab a vnode from freelist and clean it.
305 */ 306 */
306vnode_t * 307vnode_t *
307getcleanvnode(void) 308getcleanvnode(void)
308{ 309{
309 vnode_t *vp; 310 vnode_t *vp;
310 vnodelst_t *listhd; 311 vnodelst_t *listhd;
311 312
312 KASSERT(mutex_owned(&vnode_free_list_lock)); 313 KASSERT(mutex_owned(&vnode_free_list_lock));
313 314
314retry: 315retry:
315 listhd = &vnode_free_list; 316 listhd = &vnode_free_list;
316try_nextlist: 317try_nextlist:
317 TAILQ_FOREACH(vp, listhd, v_freelist) { 318 TAILQ_FOREACH(vp, listhd, v_freelist) {
318 /* 319 /*
319 * It's safe to test v_usecount and v_iflag 320 * It's safe to test v_usecount and v_iflag
320 * without holding the interlock here, since 321 * without holding the interlock here, since
321 * these vnodes should never appear on the 322 * these vnodes should never appear on the
322 * lists. 323 * lists.
323 */ 324 */
324 if (vp->v_usecount != 0) { 325 if (vp->v_usecount != 0) {
325 vpanic(vp, "free vnode isn't"); 326 vpanic(vp, "free vnode isn't");
326 } 327 }
327 if ((vp->v_iflag & VI_CLEAN) != 0) { 328 if ((vp->v_iflag & VI_CLEAN) != 0) {
328 vpanic(vp, "clean vnode on freelist"); 329 vpanic(vp, "clean vnode on freelist");
329 } 330 }
330 if (vp->v_freelisthd != listhd) { 331 if (vp->v_freelisthd != listhd) {
331 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 332 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
332 vpanic(vp, "list head mismatch"); 333 vpanic(vp, "list head mismatch");
333 } 334 }
334 if (!mutex_tryenter(&vp->v_interlock)) 335 if (!mutex_tryenter(&vp->v_interlock))
335 continue; 336 continue;
336 /* 337 /*
337 * Our lwp might hold the underlying vnode 338 * Our lwp might hold the underlying vnode
338 * locked, so don't try to reclaim a VI_LAYER 339 * locked, so don't try to reclaim a VI_LAYER
339 * node if it's locked. 340 * node if it's locked.
340 */ 341 */
341 if ((vp->v_iflag & VI_XLOCK) == 0 && 342 if ((vp->v_iflag & VI_XLOCK) == 0 &&
342 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 343 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
343 break; 344 break;
344 } 345 }
345 mutex_exit(&vp->v_interlock); 346 mutex_exit(&vp->v_interlock);
346 } 347 }
347 348
348 if (vp == NULL) { 349 if (vp == NULL) {
349 if (listhd == &vnode_free_list) { 350 if (listhd == &vnode_free_list) {
350 listhd = &vnode_hold_list; 351 listhd = &vnode_hold_list;
351 goto try_nextlist; 352 goto try_nextlist;
352 } 353 }
353 mutex_exit(&vnode_free_list_lock); 354 mutex_exit(&vnode_free_list_lock);
354 return NULL; 355 return NULL;
355 } 356 }
356 357
357 /* Remove it from the freelist. */ 358 /* Remove it from the freelist. */
358 TAILQ_REMOVE(listhd, vp, v_freelist); 359 TAILQ_REMOVE(listhd, vp, v_freelist);
359 vp->v_freelisthd = NULL; 360 vp->v_freelisthd = NULL;
360 mutex_exit(&vnode_free_list_lock); 361 mutex_exit(&vnode_free_list_lock);
361 362
362 /* 363 /*
363 * The vnode is still associated with a file system, so we must 364 * The vnode is still associated with a file system, so we must
364 * clean it out before reusing it. We need to add a reference 365 * clean it out before reusing it. We need to add a reference
365 * before doing this. If the vnode gains another reference while 366 * before doing this. If the vnode gains another reference while
366 * being cleaned out then we lose - retry. 367 * being cleaned out then we lose - retry.
367 */ 368 */
368 atomic_inc_uint(&vp->v_usecount); 369 atomic_inc_uint(&vp->v_usecount);
369 vclean(vp, DOCLOSE); 370 vclean(vp, DOCLOSE);
370 if (vp->v_usecount == 1) { 371 if (vp->v_usecount == 1) {
371 /* We're about to dirty it. */ 372 /* We're about to dirty it. */
372 vp->v_iflag &= ~VI_CLEAN; 373 vp->v_iflag &= ~VI_CLEAN;
373 mutex_exit(&vp->v_interlock); 374 mutex_exit(&vp->v_interlock);
374 if (vp->v_type == VBLK || vp->v_type == VCHR) { 375 if (vp->v_type == VBLK || vp->v_type == VCHR) {
375 spec_node_destroy(vp); 376 spec_node_destroy(vp);
376 } 377 }
377 vp->v_type = VNON; 378 vp->v_type = VNON;
378 } else { 379 } else {
379 /* 380 /*
380 * Don't return to freelist - the holder of the last 381 * Don't return to freelist - the holder of the last
381 * reference will destroy it. 382 * reference will destroy it.
382 */ 383 */
383 vrelel(vp, 0); /* releases vp->v_interlock */ 384 vrelel(vp, 0); /* releases vp->v_interlock */
384 mutex_enter(&vnode_free_list_lock); 385 mutex_enter(&vnode_free_list_lock);
385 goto retry; 386 goto retry;
386 } 387 }
387 388
388 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 389 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
389 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 390 !TAILQ_EMPTY(&vp->v_uobj.memq)) {
390 vpanic(vp, "cleaned vnode isn't"); 391 vpanic(vp, "cleaned vnode isn't");
391 } 392 }
392 if (vp->v_numoutput != 0) { 393 if (vp->v_numoutput != 0) {
393 vpanic(vp, "clean vnode has pending I/O's"); 394 vpanic(vp, "clean vnode has pending I/O's");
394 } 395 }
395 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 396 if ((vp->v_iflag & VI_ONWORKLST) != 0) {
396 vpanic(vp, "clean vnode on syncer list"); 397 vpanic(vp, "clean vnode on syncer list");
397 } 398 }
398 399
399 return vp; 400 return vp;
400} 401}
401 402
402/* 403/*
403 * Mark a mount point as busy, and gain a new reference to it. Used to 404 * Mark a mount point as busy, and gain a new reference to it. Used to
404 * prevent the file system from being unmounted during critical sections. 405 * prevent the file system from being unmounted during critical sections.
405 * 406 *
406 * => The caller must hold a pre-existing reference to the mount. 407 * => The caller must hold a pre-existing reference to the mount.
407 * => Will fail if the file system is being unmounted, or is unmounted. 408 * => Will fail if the file system is being unmounted, or is unmounted.
408 */ 409 */
409int 410int
410vfs_busy(struct mount *mp, struct mount **nextp) 411vfs_busy(struct mount *mp, struct mount **nextp)
411{ 412{
412 413
413 KASSERT(mp->mnt_refcnt > 0); 414 KASSERT(mp->mnt_refcnt > 0);
414 415
415 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 416 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
416 if (nextp != NULL) { 417 if (nextp != NULL) {
417 KASSERT(mutex_owned(&mountlist_lock)); 418 KASSERT(mutex_owned(&mountlist_lock));
418 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 419 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
419 } 420 }
420 return EBUSY; 421 return EBUSY;
421 } 422 }
422 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 423 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
423 rw_exit(&mp->mnt_unmounting); 424 rw_exit(&mp->mnt_unmounting);
424 if (nextp != NULL) { 425 if (nextp != NULL) {
425 KASSERT(mutex_owned(&mountlist_lock)); 426 KASSERT(mutex_owned(&mountlist_lock));
426 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 427 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
427 } 428 }
428 return ENOENT; 429 return ENOENT;
429 } 430 }
430 if (nextp != NULL) { 431 if (nextp != NULL) {
431 mutex_exit(&mountlist_lock); 432 mutex_exit(&mountlist_lock);
432 } 433 }
433 atomic_inc_uint(&mp->mnt_refcnt); 434 atomic_inc_uint(&mp->mnt_refcnt);
434 return 0; 435 return 0;
435} 436}
436 437
437/* 438/*
438 * Unbusy a busy filesystem. 439 * Unbusy a busy filesystem.
439 * 440 *
440 * => If keepref is true, preserve reference added by vfs_busy(). 441 * => If keepref is true, preserve reference added by vfs_busy().
441 * => If nextp != NULL, acquire mountlist_lock. 442 * => If nextp != NULL, acquire mountlist_lock.
442 */ 443 */
443void 444void
444vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 445vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
445{ 446{
446 447
447 KASSERT(mp->mnt_refcnt > 0); 448 KASSERT(mp->mnt_refcnt > 0);
448 449
449 if (nextp != NULL) { 450 if (nextp != NULL) {
450 mutex_enter(&mountlist_lock); 451 mutex_enter(&mountlist_lock);
451 } 452 }
452 rw_exit(&mp->mnt_unmounting); 453 rw_exit(&mp->mnt_unmounting);
453 if (!keepref) { 454 if (!keepref) {
454 vfs_destroy(mp); 455 vfs_destroy(mp);
455 } 456 }
456 if (nextp != NULL) { 457 if (nextp != NULL) {
457 KASSERT(mutex_owned(&mountlist_lock)); 458 KASSERT(mutex_owned(&mountlist_lock));
458 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 459 *nextp = CIRCLEQ_NEXT(mp, mnt_list);
459 } 460 }
460} 461}
461 462
462/* 463/*
463 * Lookup a filesystem type, and if found allocate and initialize 464 * Lookup a filesystem type, and if found allocate and initialize
464 * a mount structure for it. 465 * a mount structure for it.
465 * 466 *
466 * Devname is usually updated by mount(8) after booting. 467 * Devname is usually updated by mount(8) after booting.
467 */ 468 */
468int 469int
469vfs_rootmountalloc(const char *fstypename, const char *devname, 470vfs_rootmountalloc(const char *fstypename, const char *devname,
470 struct mount **mpp) 471 struct mount **mpp)
471{ 472{
472 struct vfsops *vfsp = NULL; 473 struct vfsops *vfsp = NULL;
473 struct mount *mp; 474 struct mount *mp;
474 475
475 mutex_enter(&vfs_list_lock); 476 mutex_enter(&vfs_list_lock);
476 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 477 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
477 if (!strncmp(vfsp->vfs_name, fstypename,  478 if (!strncmp(vfsp->vfs_name, fstypename,
478 sizeof(mp->mnt_stat.f_fstypename))) 479 sizeof(mp->mnt_stat.f_fstypename)))
479 break; 480 break;
480 if (vfsp == NULL) { 481 if (vfsp == NULL) {
481 mutex_exit(&vfs_list_lock); 482 mutex_exit(&vfs_list_lock);
482 return (ENODEV); 483 return (ENODEV);
483 } 484 }
484 vfsp->vfs_refcount++; 485 vfsp->vfs_refcount++;
485 mutex_exit(&vfs_list_lock); 486 mutex_exit(&vfs_list_lock);
486 487
487 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 488 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
488 if (mp == NULL) 489 if (mp == NULL)
489 return ENOMEM; 490 return ENOMEM;
490 mp->mnt_refcnt = 1; 491 mp->mnt_refcnt = 1;
491 rw_init(&mp->mnt_unmounting); 492 rw_init(&mp->mnt_unmounting);
492 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 493 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
493 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 494 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
494 (void)vfs_busy(mp, NULL); 495 (void)vfs_busy(mp, NULL);
495 TAILQ_INIT(&mp->mnt_vnodelist); 496 TAILQ_INIT(&mp->mnt_vnodelist);
496 mp->mnt_op = vfsp; 497 mp->mnt_op = vfsp;
497 mp->mnt_flag = MNT_RDONLY; 498 mp->mnt_flag = MNT_RDONLY;
498 mp->mnt_vnodecovered = NULL; 499 mp->mnt_vnodecovered = NULL;
499 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 500 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
500 sizeof(mp->mnt_stat.f_fstypename)); 501 sizeof(mp->mnt_stat.f_fstypename));
501 mp->mnt_stat.f_mntonname[0] = '/'; 502 mp->mnt_stat.f_mntonname[0] = '/';
502 mp->mnt_stat.f_mntonname[1] = '\0'; 503 mp->mnt_stat.f_mntonname[1] = '\0';
503 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 504 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
504 '\0'; 505 '\0';
505 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 506 (void)copystr(devname, mp->mnt_stat.f_mntfromname,
506 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 507 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
507 mount_initspecific(mp); 508 mount_initspecific(mp);
508 *mpp = mp; 509 *mpp = mp;
509 return (0); 510 return (0);
510} 511}
511 512
512/* 513/*
513 * Routines having to do with the management of the vnode table. 514 * Routines having to do with the management of the vnode table.
514 */ 515 */
515extern int (**dead_vnodeop_p)(void *); 516extern int (**dead_vnodeop_p)(void *);
516 517
517/* 518/*
518 * Return the next vnode from the free list. 519 * Return the next vnode from the free list.
519 */ 520 */
520int 521int
521getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 522getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
522 vnode_t **vpp) 523 vnode_t **vpp)
523{ 524{
524 struct uvm_object *uobj; 525 struct uvm_object *uobj;
525 static int toggle; 526 static int toggle;
526 vnode_t *vp; 527 vnode_t *vp;
527 int error = 0, tryalloc; 528 int error = 0, tryalloc;
528 529
529 try_again: 530 try_again:
530 if (mp != NULL) { 531 if (mp != NULL) {
531 /* 532 /*
532 * Mark filesystem busy while we're creating a 533 * Mark filesystem busy while we're creating a
533 * vnode. If unmount is in progress, this will 534 * vnode. If unmount is in progress, this will
534 * fail. 535 * fail.
535 */ 536 */
536 error = vfs_busy(mp, NULL); 537 error = vfs_busy(mp, NULL);
537 if (error) 538 if (error)
538 return error; 539 return error;
539 } 540 }
540 541
541 /* 542 /*
542 * We must choose whether to allocate a new vnode or recycle an 543 * We must choose whether to allocate a new vnode or recycle an
543 * existing one. The criterion for allocating a new one is that 544 * existing one. The criterion for allocating a new one is that
544 * the total number of vnodes is less than the number desired or 545 * the total number of vnodes is less than the number desired or
545 * there are no vnodes on either free list. Generally we only 546 * there are no vnodes on either free list. Generally we only
546 * want to recycle vnodes that have no buffers associated with 547 * want to recycle vnodes that have no buffers associated with
547 * them, so we look first on the vnode_free_list. If it is empty, 548 * them, so we look first on the vnode_free_list. If it is empty,
548 * we next consider vnodes with referencing buffers on the 549 * we next consider vnodes with referencing buffers on the
549 * vnode_hold_list. The toggle ensures that half the time we 550 * vnode_hold_list. The toggle ensures that half the time we
550 * will use a buffer from the vnode_hold_list, and half the time 551 * will use a buffer from the vnode_hold_list, and half the time
551 * we will allocate a new one unless the list has grown to twice 552 * we will allocate a new one unless the list has grown to twice
552 * the desired size. We are reticent to recycle vnodes from the 553 * the desired size. We are reticent to recycle vnodes from the
553 * vnode_hold_list because we will lose the identity of all its 554 * vnode_hold_list because we will lose the identity of all its
554 * referencing buffers. 555 * referencing buffers.
555 */ 556 */
556 557
557 vp = NULL; 558 vp = NULL;
558 559
559 mutex_enter(&vnode_free_list_lock); 560 mutex_enter(&vnode_free_list_lock);
560 561
561 toggle ^= 1; 562 toggle ^= 1;
562 if (numvnodes > 2 * desiredvnodes) 563 if (numvnodes > 2 * desiredvnodes)
563 toggle = 0; 564 toggle = 0;
564 565
565 tryalloc = numvnodes < desiredvnodes || 566 tryalloc = numvnodes < desiredvnodes ||
566 (TAILQ_FIRST(&vnode_free_list) == NULL && 567 (TAILQ_FIRST(&vnode_free_list) == NULL &&
567 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 568 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
568 569
569 if (tryalloc) { 570 if (tryalloc) {
570 numvnodes++; 571 numvnodes++;
571 mutex_exit(&vnode_free_list_lock); 572 mutex_exit(&vnode_free_list_lock);
572 if ((vp = vnalloc(NULL)) == NULL) { 573 if ((vp = vnalloc(NULL)) == NULL) {
573 mutex_enter(&vnode_free_list_lock); 574 mutex_enter(&vnode_free_list_lock);
574 numvnodes--; 575 numvnodes--;
575 } else 576 } else
576 vp->v_usecount = 1; 577 vp->v_usecount = 1;
577 } 578 }
578 579
579 if (vp == NULL) { 580 if (vp == NULL) {
580 vp = getcleanvnode(); 581 vp = getcleanvnode();
581 if (vp == NULL) { 582 if (vp == NULL) {
582 if (mp != NULL) { 583 if (mp != NULL) {
583 vfs_unbusy(mp, false, NULL); 584 vfs_unbusy(mp, false, NULL);
584 } 585 }
585 if (tryalloc) { 586 if (tryalloc) {
586 printf("WARNING: unable to allocate new " 587 printf("WARNING: unable to allocate new "
587 "vnode, retrying...\n"); 588 "vnode, retrying...\n");
588 kpause("newvn", false, hz, NULL); 589 kpause("newvn", false, hz, NULL);
589 goto try_again; 590 goto try_again;
590 } 591 }
591 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 592 tablefull("vnode", "increase kern.maxvnodes or NVNODE");
592 *vpp = 0; 593 *vpp = 0;
593 return (ENFILE); 594 return (ENFILE);
594 } 595 }
595 vp->v_iflag = 0; 596 vp->v_iflag = 0;
596 vp->v_vflag = 0; 597 vp->v_vflag = 0;
597 vp->v_uflag = 0; 598 vp->v_uflag = 0;
598 vp->v_socket = NULL; 599 vp->v_socket = NULL;
599 } 600 }
600 601
601 KASSERT(vp->v_usecount == 1); 602 KASSERT(vp->v_usecount == 1);
602 KASSERT(vp->v_freelisthd == NULL); 603 KASSERT(vp->v_freelisthd == NULL);
603 KASSERT(LIST_EMPTY(&vp->v_nclist)); 604 KASSERT(LIST_EMPTY(&vp->v_nclist));
604 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 605 KASSERT(LIST_EMPTY(&vp->v_dnclist));
605 606
606 vp->v_type = VNON; 607 vp->v_type = VNON;
607 vp->v_vnlock = &vp->v_lock; 608 vp->v_vnlock = &vp->v_lock;
608 vp->v_tag = tag; 609 vp->v_tag = tag;
609 vp->v_op = vops; 610 vp->v_op = vops;
610 insmntque(vp, mp); 611 insmntque(vp, mp);
611 *vpp = vp; 612 *vpp = vp;
612 vp->v_data = 0; 613 vp->v_data = 0;
613 614
614 /* 615 /*
615 * initialize uvm_object within vnode. 616 * initialize uvm_object within vnode.
616 */ 617 */
617 618
618 uobj = &vp->v_uobj; 619 uobj = &vp->v_uobj;
619 KASSERT(uobj->pgops == &uvm_vnodeops); 620 KASSERT(uobj->pgops == &uvm_vnodeops);
620 KASSERT(uobj->uo_npages == 0); 621 KASSERT(uobj->uo_npages == 0);
621 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 622 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
622 vp->v_size = vp->v_writesize = VSIZENOTSET; 623 vp->v_size = vp->v_writesize = VSIZENOTSET;
623 624
624 if (mp != NULL) { 625 if (mp != NULL) {
625 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 626 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
626 vp->v_vflag |= VV_MPSAFE; 627 vp->v_vflag |= VV_MPSAFE;
627 vfs_unbusy(mp, true, NULL); 628 vfs_unbusy(mp, true, NULL);
628 } 629 }
629 630
630 return (0); 631 return (0);
631} 632}
632 633
633/* 634/*
634 * This is really just the reverse of getnewvnode(). Needed for 635 * This is really just the reverse of getnewvnode(). Needed for
635 * VFS_VGET functions who may need to push back a vnode in case 636 * VFS_VGET functions who may need to push back a vnode in case
636 * of a locking race. 637 * of a locking race.
637 */ 638 */
638void 639void
639ungetnewvnode(vnode_t *vp) 640ungetnewvnode(vnode_t *vp)
640{ 641{
641 642
642 KASSERT(vp->v_usecount == 1); 643 KASSERT(vp->v_usecount == 1);
643 KASSERT(vp->v_data == NULL); 644 KASSERT(vp->v_data == NULL);
644 KASSERT(vp->v_freelisthd == NULL); 645 KASSERT(vp->v_freelisthd == NULL);
645 646
646 mutex_enter(&vp->v_interlock); 647 mutex_enter(&vp->v_interlock);
647 vp->v_iflag |= VI_CLEAN; 648 vp->v_iflag |= VI_CLEAN;
648 vrelel(vp, 0); 649 vrelel(vp, 0);
649} 650}
650 651
651/* 652/*
652 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 653 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a
653 * marker vnode and we are prepared to wait for the allocation. 654 * marker vnode and we are prepared to wait for the allocation.
654 */ 655 */
655vnode_t * 656vnode_t *
656vnalloc(struct mount *mp) 657vnalloc(struct mount *mp)
657{ 658{
658 vnode_t *vp; 659 vnode_t *vp;
659 660
660 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 661 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
661 if (vp == NULL) { 662 if (vp == NULL) {
662 return NULL; 663 return NULL;
663 } 664 }
664 665
665 memset(vp, 0, sizeof(*vp)); 666 memset(vp, 0, sizeof(*vp));
666 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 667 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
667 cv_init(&vp->v_cv, "vnode"); 668 cv_init(&vp->v_cv, "vnode");
668 /* 669 /*
669 * done by memset() above. 670 * done by memset() above.
670 * LIST_INIT(&vp->v_nclist); 671 * LIST_INIT(&vp->v_nclist);
671 * LIST_INIT(&vp->v_dnclist); 672 * LIST_INIT(&vp->v_dnclist);
672 */ 673 */
673 674
674 if (mp != NULL) { 675 if (mp != NULL) {
675 vp->v_mount = mp; 676 vp->v_mount = mp;
676 vp->v_type = VBAD; 677 vp->v_type = VBAD;
677 vp->v_iflag = VI_MARKER; 678 vp->v_iflag = VI_MARKER;
678 } else { 679 } else {
679 rw_init(&vp->v_lock.vl_lock); 680 rw_init(&vp->v_lock.vl_lock);
680 } 681 }
681 682
682 return vp; 683 return vp;
683} 684}
684 685
685/* 686/*
686 * Free an unused, unreferenced vnode. 687 * Free an unused, unreferenced vnode.
687 */ 688 */
688void 689void
689vnfree(vnode_t *vp) 690vnfree(vnode_t *vp)
690{ 691{
691 692
692 KASSERT(vp->v_usecount == 0); 693 KASSERT(vp->v_usecount == 0);
693 694
694 if ((vp->v_iflag & VI_MARKER) == 0) { 695 if ((vp->v_iflag & VI_MARKER) == 0) {
695 rw_destroy(&vp->v_lock.vl_lock); 696 rw_destroy(&vp->v_lock.vl_lock);
696 mutex_enter(&vnode_free_list_lock); 697 mutex_enter(&vnode_free_list_lock);
697 numvnodes--; 698 numvnodes--;
698 mutex_exit(&vnode_free_list_lock); 699 mutex_exit(&vnode_free_list_lock);
699 } 700 }
700 701
701 UVM_OBJ_DESTROY(&vp->v_uobj); 702 UVM_OBJ_DESTROY(&vp->v_uobj);
702 cv_destroy(&vp->v_cv); 703 cv_destroy(&vp->v_cv);
703 pool_cache_put(vnode_cache, vp); 704 pool_cache_put(vnode_cache, vp);
704} 705}
705 706
706/* 707/*
707 * Remove a vnode from its freelist. 708 * Remove a vnode from its freelist.
708 */ 709 */
709static inline void 710static inline void
710vremfree(vnode_t *vp) 711vremfree(vnode_t *vp)
711{ 712{
712 713
713 KASSERT(mutex_owned(&vp->v_interlock)); 714 KASSERT(mutex_owned(&vp->v_interlock));
714 KASSERT(vp->v_usecount == 0); 715 KASSERT(vp->v_usecount == 0);
715 716
716 /* 717 /*
717 * Note that the reference count must not change until 718 * Note that the reference count must not change until
718 * the vnode is removed. 719 * the vnode is removed.
719 */ 720 */
720 mutex_enter(&vnode_free_list_lock); 721 mutex_enter(&vnode_free_list_lock);
721 if (vp->v_holdcnt > 0) { 722 if (vp->v_holdcnt > 0) {
722 KASSERT(vp->v_freelisthd == &vnode_hold_list); 723 KASSERT(vp->v_freelisthd == &vnode_hold_list);
723 } else { 724 } else {
724 KASSERT(vp->v_freelisthd == &vnode_free_list); 725 KASSERT(vp->v_freelisthd == &vnode_free_list);
725 } 726 }
726 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 727 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
727 vp->v_freelisthd = NULL; 728 vp->v_freelisthd = NULL;
728 mutex_exit(&vnode_free_list_lock); 729 mutex_exit(&vnode_free_list_lock);
729} 730}
730 731
731/* 732/*
732 * Move a vnode from one mount queue to another. 733 * Move a vnode from one mount queue to another.
733 */ 734 */
734static void 735static void
735insmntque(vnode_t *vp, struct mount *mp) 736insmntque(vnode_t *vp, struct mount *mp)
736{ 737{
737 struct mount *omp; 738 struct mount *omp;
738 739
739#ifdef DIAGNOSTIC 740#ifdef DIAGNOSTIC
740 if ((mp != NULL) && 741 if ((mp != NULL) &&
741 (mp->mnt_iflag & IMNT_UNMOUNT) && 742 (mp->mnt_iflag & IMNT_UNMOUNT) &&
742 !(mp->mnt_flag & MNT_SOFTDEP) && 743 !(mp->mnt_flag & MNT_SOFTDEP) &&
743 vp->v_tag != VT_VFS) { 744 vp->v_tag != VT_VFS) {
744 panic("insmntque into dying filesystem"); 745 panic("insmntque into dying filesystem");
745 } 746 }
746#endif 747#endif
747 748
748 mutex_enter(&mntvnode_lock); 749 mutex_enter(&mntvnode_lock);
749 /* 750 /*
750 * Delete from old mount point vnode list, if on one. 751 * Delete from old mount point vnode list, if on one.
751 */ 752 */
752 if ((omp = vp->v_mount) != NULL) 753 if ((omp = vp->v_mount) != NULL)
753 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 754 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
754 /* 755 /*
755 * Insert into list of vnodes for the new mount point, if 756 * Insert into list of vnodes for the new mount point, if
756 * available. The caller must take a reference on the mount 757 * available. The caller must take a reference on the mount
757 * structure and donate to the vnode. 758 * structure and donate to the vnode.
758 */ 759 */
759 if ((vp->v_mount = mp) != NULL) 760 if ((vp->v_mount = mp) != NULL)
760 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 761 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
761 mutex_exit(&mntvnode_lock); 762 mutex_exit(&mntvnode_lock);
762 763
763 if (omp != NULL) { 764 if (omp != NULL) {
764 /* Release reference to old mount. */ 765 /* Release reference to old mount. */
765 vfs_destroy(omp); 766 vfs_destroy(omp);
766 } 767 }
767} 768}
768 769
769/* 770/*
770 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 771 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
771 * recycled. 772 * recycled.
772 */ 773 */
773void 774void
774vwait(vnode_t *vp, int flags) 775vwait(vnode_t *vp, int flags)
775{ 776{
776 777
777 KASSERT(mutex_owned(&vp->v_interlock)); 778 KASSERT(mutex_owned(&vp->v_interlock));
778 KASSERT(vp->v_usecount != 0); 779 KASSERT(vp->v_usecount != 0);
779 780
780 while ((vp->v_iflag & flags) != 0) 781 while ((vp->v_iflag & flags) != 0)
781 cv_wait(&vp->v_cv, &vp->v_interlock); 782 cv_wait(&vp->v_cv, &vp->v_interlock);
782} 783}
783 784
784/* 785/*
785 * Insert a marker vnode into a mount's vnode list, after the 786 * Insert a marker vnode into a mount's vnode list, after the
786 * specified vnode. mntvnode_lock must be held. 787 * specified vnode. mntvnode_lock must be held.
787 */ 788 */
788void 789void
789vmark(vnode_t *mvp, vnode_t *vp) 790vmark(vnode_t *mvp, vnode_t *vp)
790{ 791{
791 struct mount *mp; 792 struct mount *mp;
792 793
793 mp = mvp->v_mount; 794 mp = mvp->v_mount;
794 795
795 KASSERT(mutex_owned(&mntvnode_lock)); 796 KASSERT(mutex_owned(&mntvnode_lock));
796 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 797 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
797 KASSERT(vp->v_mount == mp); 798 KASSERT(vp->v_mount == mp);
798 799
799 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes); 800 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
800} 801}
801 802
802/* 803/*
803 * Remove a marker vnode from a mount's vnode list, and return 804 * Remove a marker vnode from a mount's vnode list, and return
804 * a pointer to the next vnode in the list. mntvnode_lock must 805 * a pointer to the next vnode in the list. mntvnode_lock must
805 * be held. 806 * be held.
806 */ 807 */
807vnode_t * 808vnode_t *
808vunmark(vnode_t *mvp) 809vunmark(vnode_t *mvp)
809{ 810{
810 vnode_t *vp; 811 vnode_t *vp;
811 struct mount *mp; 812 struct mount *mp;
812 813
813 mp = mvp->v_mount; 814 mp = mvp->v_mount;
814 815
815 KASSERT(mutex_owned(&mntvnode_lock)); 816 KASSERT(mutex_owned(&mntvnode_lock));
816 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 817 KASSERT((mvp->v_iflag & VI_MARKER) != 0);
817 818
818 vp = TAILQ_NEXT(mvp, v_mntvnodes); 819 vp = TAILQ_NEXT(mvp, v_mntvnodes);
819 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);  820 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
820 821
821 KASSERT(vp == NULL || vp->v_mount == mp); 822 KASSERT(vp == NULL || vp->v_mount == mp);
822 823
823 return vp; 824 return vp;
824} 825}
825 826
826/* 827/*
827 * Update outstanding I/O count and do wakeup if requested. 828 * Update outstanding I/O count and do wakeup if requested.
828 */ 829 */
829void 830void
830vwakeup(struct buf *bp) 831vwakeup(struct buf *bp)
831{ 832{
832 struct vnode *vp; 833 struct vnode *vp;
833 834
834 if ((vp = bp->b_vp) == NULL) 835 if ((vp = bp->b_vp) == NULL)
835 return; 836 return;
836 837
837 KASSERT(bp->b_objlock == &vp->v_interlock); 838 KASSERT(bp->b_objlock == &vp->v_interlock);
838 KASSERT(mutex_owned(bp->b_objlock)); 839 KASSERT(mutex_owned(bp->b_objlock));
839 840
840 if (--vp->v_numoutput < 0) 841 if (--vp->v_numoutput < 0)
841 panic("vwakeup: neg numoutput, vp %p", vp); 842 panic("vwakeup: neg numoutput, vp %p", vp);
842 if (vp->v_numoutput == 0) 843 if (vp->v_numoutput == 0)
843 cv_broadcast(&vp->v_cv); 844 cv_broadcast(&vp->v_cv);
844} 845}
845 846
846/* 847/*
847 * Flush out and invalidate all buffers associated with a vnode. 848 * Flush out and invalidate all buffers associated with a vnode.
848 * Called with the underlying vnode locked, which should prevent new dirty 849 * Called with the underlying vnode locked, which should prevent new dirty
849 * buffers from being queued. 850 * buffers from being queued.
850 */ 851 */
851int 852int
852vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, 853vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
853 bool catch, int slptimeo) 854 bool catch, int slptimeo)
854{ 855{
855 struct buf *bp, *nbp; 856 struct buf *bp, *nbp;
856 int error; 857 int error;
857 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 858 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
858 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); 859 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
859 860
860 /* XXXUBC this doesn't look at flags or slp* */ 861 /* XXXUBC this doesn't look at flags or slp* */
861 mutex_enter(&vp->v_interlock); 862 mutex_enter(&vp->v_interlock);
862 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 863 error = VOP_PUTPAGES(vp, 0, 0, flushflags);
863 if (error) { 864 if (error) {
864 return error; 865 return error;
865 } 866 }
866 867
867 if (flags & V_SAVE) { 868 if (flags & V_SAVE) {
868 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); 869 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
869 if (error) 870 if (error)
870 return (error); 871 return (error);
871 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); 872 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
872 } 873 }
873 874
874 mutex_enter(&bufcache_lock); 875 mutex_enter(&bufcache_lock);
875restart: 876restart:
876 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 877 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
877 nbp = LIST_NEXT(bp, b_vnbufs); 878 nbp = LIST_NEXT(bp, b_vnbufs);
878 error = bbusy(bp, catch, slptimeo, NULL); 879 error = bbusy(bp, catch, slptimeo, NULL);
879 if (error != 0) { 880 if (error != 0) {
880 if (error == EPASSTHROUGH) 881 if (error == EPASSTHROUGH)
881 goto restart; 882 goto restart;
882 mutex_exit(&bufcache_lock); 883 mutex_exit(&bufcache_lock);
883 return (error); 884 return (error);
884 } 885 }
885 brelsel(bp, BC_INVAL | BC_VFLUSH); 886 brelsel(bp, BC_INVAL | BC_VFLUSH);
886 } 887 }
887 888
888 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 889 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
889 nbp = LIST_NEXT(bp, b_vnbufs); 890 nbp = LIST_NEXT(bp, b_vnbufs);
890 error = bbusy(bp, catch, slptimeo, NULL); 891 error = bbusy(bp, catch, slptimeo, NULL);
891 if (error != 0) { 892 if (error != 0) {
892 if (error == EPASSTHROUGH) 893 if (error == EPASSTHROUGH)
893 goto restart; 894 goto restart;
894 mutex_exit(&bufcache_lock); 895 mutex_exit(&bufcache_lock);
895 return (error); 896 return (error);
896 } 897 }
897 /* 898 /*
898 * XXX Since there are no node locks for NFS, I believe 899 * XXX Since there are no node locks for NFS, I believe
899 * there is a slight chance that a delayed write will 900 * there is a slight chance that a delayed write will
900 * occur while sleeping just above, so check for it. 901 * occur while sleeping just above, so check for it.
901 */ 902 */
902 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { 903 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
903#ifdef DEBUG 904#ifdef DEBUG
904 printf("buffer still DELWRI\n"); 905 printf("buffer still DELWRI\n");
905#endif 906#endif
906 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 907 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
907 mutex_exit(&bufcache_lock); 908 mutex_exit(&bufcache_lock);
908 VOP_BWRITE(bp); 909 VOP_BWRITE(bp);
909 mutex_enter(&bufcache_lock); 910 mutex_enter(&bufcache_lock);
910 goto restart; 911 goto restart;
911 } 912 }
912 brelsel(bp, BC_INVAL | BC_VFLUSH); 913 brelsel(bp, BC_INVAL | BC_VFLUSH);
913 } 914 }
914 915
915#ifdef DIAGNOSTIC 916#ifdef DIAGNOSTIC
916 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 917 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
917 panic("vinvalbuf: flush failed, vp %p", vp); 918 panic("vinvalbuf: flush failed, vp %p", vp);
918#endif 919#endif
919 920
920 mutex_exit(&bufcache_lock); 921 mutex_exit(&bufcache_lock);
921 922
922 return (0); 923 return (0);
923} 924}
924 925
925/* 926/*
926 * Destroy any in core blocks past the truncation length. 927 * Destroy any in core blocks past the truncation length.
927 * Called with the underlying vnode locked, which should prevent new dirty 928 * Called with the underlying vnode locked, which should prevent new dirty
928 * buffers from being queued. 929 * buffers from being queued.
929 */ 930 */
930int 931int
931vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) 932vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
932{ 933{
933 struct buf *bp, *nbp; 934 struct buf *bp, *nbp;
934 int error; 935 int error;
935 voff_t off; 936 voff_t off;
936 937
937 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 938 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
938 mutex_enter(&vp->v_interlock); 939 mutex_enter(&vp->v_interlock);
939 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 940 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
940 if (error) { 941 if (error) {
941 return error; 942 return error;
942 } 943 }
943 944
944 mutex_enter(&bufcache_lock); 945 mutex_enter(&bufcache_lock);
945restart: 946restart:
946 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 947 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
947 nbp = LIST_NEXT(bp, b_vnbufs); 948 nbp = LIST_NEXT(bp, b_vnbufs);
948 if (bp->b_lblkno < lbn) 949 if (bp->b_lblkno < lbn)
949 continue; 950 continue;
950 error = bbusy(bp, catch, slptimeo, NULL); 951 error = bbusy(bp, catch, slptimeo, NULL);
951 if (error != 0) { 952 if (error != 0) {
952 if (error == EPASSTHROUGH) 953 if (error == EPASSTHROUGH)
953 goto restart; 954 goto restart;
954 mutex_exit(&bufcache_lock); 955 mutex_exit(&bufcache_lock);
955 return (error); 956 return (error);
956 } 957 }
957 brelsel(bp, BC_INVAL | BC_VFLUSH); 958 brelsel(bp, BC_INVAL | BC_VFLUSH);
958 } 959 }
959 960
960 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 961 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
961 nbp = LIST_NEXT(bp, b_vnbufs); 962 nbp = LIST_NEXT(bp, b_vnbufs);
962 if (bp->b_lblkno < lbn) 963 if (bp->b_lblkno < lbn)
963 continue; 964 continue;
964 error = bbusy(bp, catch, slptimeo, NULL); 965 error = bbusy(bp, catch, slptimeo, NULL);
965 if (error != 0) { 966 if (error != 0) {
966 if (error == EPASSTHROUGH) 967 if (error == EPASSTHROUGH)
967 goto restart; 968 goto restart;
968 mutex_exit(&bufcache_lock); 969 mutex_exit(&bufcache_lock);
969 return (error); 970 return (error);
970 } 971 }
971 brelsel(bp, BC_INVAL | BC_VFLUSH); 972 brelsel(bp, BC_INVAL | BC_VFLUSH);
972 } 973 }
973 mutex_exit(&bufcache_lock); 974 mutex_exit(&bufcache_lock);
974 975
975 return (0); 976 return (0);
976} 977}
977 978
978/* 979/*
979 * Flush all dirty buffers from a vnode. 980 * Flush all dirty buffers from a vnode.
980 * Called with the underlying vnode locked, which should prevent new dirty 981 * Called with the underlying vnode locked, which should prevent new dirty
981 * buffers from being queued. 982 * buffers from being queued.
982 */ 983 */
983void 984void
984vflushbuf(struct vnode *vp, int sync) 985vflushbuf(struct vnode *vp, int sync)
985{ 986{
986 struct buf *bp, *nbp; 987 struct buf *bp, *nbp;
987 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 988 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
988 bool dirty; 989 bool dirty;
989 990
990 mutex_enter(&vp->v_interlock); 991 mutex_enter(&vp->v_interlock);
991 (void) VOP_PUTPAGES(vp, 0, 0, flags); 992 (void) VOP_PUTPAGES(vp, 0, 0, flags);
992 993
993loop: 994loop:
994 mutex_enter(&bufcache_lock); 995 mutex_enter(&bufcache_lock);
995 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 996 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
996 nbp = LIST_NEXT(bp, b_vnbufs); 997 nbp = LIST_NEXT(bp, b_vnbufs);
997 if ((bp->b_cflags & BC_BUSY)) 998 if ((bp->b_cflags & BC_BUSY))
998 continue; 999 continue;
999 if ((bp->b_oflags & BO_DELWRI) == 0) 1000 if ((bp->b_oflags & BO_DELWRI) == 0)
1000 panic("vflushbuf: not dirty, bp %p", bp); 1001 panic("vflushbuf: not dirty, bp %p", bp);
1001 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 1002 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
1002 mutex_exit(&bufcache_lock); 1003 mutex_exit(&bufcache_lock);
1003 /* 1004 /*
1004 * Wait for I/O associated with indirect blocks to complete, 1005 * Wait for I/O associated with indirect blocks to complete,
1005 * since there is no way to quickly wait for them below. 1006 * since there is no way to quickly wait for them below.
1006 */ 1007 */
1007 if (bp->b_vp == vp || sync == 0) 1008 if (bp->b_vp == vp || sync == 0)
1008 (void) bawrite(bp); 1009 (void) bawrite(bp);
1009 else 1010 else
1010 (void) bwrite(bp); 1011 (void) bwrite(bp);
1011 goto loop; 1012 goto loop;
1012 } 1013 }
1013 mutex_exit(&bufcache_lock); 1014 mutex_exit(&bufcache_lock);
1014 1015
1015 if (sync == 0) 1016 if (sync == 0)
1016 return; 1017 return;
1017 1018
1018 mutex_enter(&vp->v_interlock); 1019 mutex_enter(&vp->v_interlock);
1019 while (vp->v_numoutput != 0) 1020 while (vp->v_numoutput != 0)
1020 cv_wait(&vp->v_cv, &vp->v_interlock); 1021 cv_wait(&vp->v_cv, &vp->v_interlock);
1021 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); 1022 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
1022 mutex_exit(&vp->v_interlock); 1023 mutex_exit(&vp->v_interlock);
1023 1024
1024 if (dirty) { 1025 if (dirty) {
1025 vprint("vflushbuf: dirty", vp); 1026 vprint("vflushbuf: dirty", vp);
1026 goto loop; 1027 goto loop;
1027 } 1028 }
1028} 1029}
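The synchronous case above ends by draining the vnode's outstanding writes: v_numoutput is decremented by the I/O completion path, which signals v_cv, and the loop re-checks the counter after every wakeup. A minimal sketch of that wait-for-a-counter-to-reach-zero pattern with POSIX threads, using hypothetical names (pending, done_cv, io_done, wait_for_drain) rather than the kernel interfaces:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done_cv = PTHREAD_COND_INITIALIZER;
    static unsigned        pending;     /* plays the role of v_numoutput */

    /* Completion side: drop the count and wake waiters when it hits zero. */
    static void
    io_done(void)
    {
            pthread_mutex_lock(&lock);
            if (--pending == 0)
                    pthread_cond_broadcast(&done_cv);
            pthread_mutex_unlock(&lock);
    }

    /* Flusher side: block until everything in flight has completed.
     * The condition is re-tested after each wakeup, so spurious
     * wakeups are harmless. */
    static void
    wait_for_drain(void)
    {
            pthread_mutex_lock(&lock);
            while (pending != 0)
                    pthread_cond_wait(&done_cv, &lock);
            pthread_mutex_unlock(&lock);
    }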
1029 1030
1030/* 1031/*
1031 * Create a vnode for a block device. 1032 * Create a vnode for a block device.
1032 * Used for root filesystem and swap areas. 1033 * Used for root filesystem and swap areas.
1033 * Also used for memory file system special devices. 1034 * Also used for memory file system special devices.
1034 */ 1035 */
1035int 1036int
1036bdevvp(dev_t dev, vnode_t **vpp) 1037bdevvp(dev_t dev, vnode_t **vpp)
1037{ 1038{
1038 1039
1039 return (getdevvp(dev, vpp, VBLK)); 1040 return (getdevvp(dev, vpp, VBLK));
1040} 1041}
1041 1042
1042/* 1043/*
1043 * Create a vnode for a character device. 1044 * Create a vnode for a character device.
1044 * Used for kernfs and some console handling. 1045 * Used for kernfs and some console handling.
1045 */ 1046 */
1046int 1047int
1047cdevvp(dev_t dev, vnode_t **vpp) 1048cdevvp(dev_t dev, vnode_t **vpp)
1048{ 1049{
1049 1050
1050 return (getdevvp(dev, vpp, VCHR)); 1051 return (getdevvp(dev, vpp, VCHR));
1051} 1052}
1052 1053
1053/* 1054/*
1054 * Associate a buffer with a vnode. There must already be a hold on 1055 * Associate a buffer with a vnode. There must already be a hold on
1055 * the vnode. 1056 * the vnode.
1056 */ 1057 */
1057void 1058void
1058bgetvp(struct vnode *vp, struct buf *bp) 1059bgetvp(struct vnode *vp, struct buf *bp)
1059{ 1060{
1060 1061
1061 KASSERT(bp->b_vp == NULL); 1062 KASSERT(bp->b_vp == NULL);
1062 KASSERT(bp->b_objlock == &buffer_lock); 1063 KASSERT(bp->b_objlock == &buffer_lock);
1063 KASSERT(mutex_owned(&vp->v_interlock)); 1064 KASSERT(mutex_owned(&vp->v_interlock));
1064 KASSERT(mutex_owned(&bufcache_lock)); 1065 KASSERT(mutex_owned(&bufcache_lock));
1065 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1066 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1066 KASSERT(!cv_has_waiters(&bp->b_done)); 1067 KASSERT(!cv_has_waiters(&bp->b_done));
1067 1068
1068 vholdl(vp); 1069 vholdl(vp);
1069 bp->b_vp = vp; 1070 bp->b_vp = vp;
1070 if (vp->v_type == VBLK || vp->v_type == VCHR) 1071 if (vp->v_type == VBLK || vp->v_type == VCHR)
1071 bp->b_dev = vp->v_rdev; 1072 bp->b_dev = vp->v_rdev;
1072 else 1073 else
1073 bp->b_dev = NODEV; 1074 bp->b_dev = NODEV;
1074 1075
1075 /* 1076 /*
1076 * Insert onto list for new vnode. 1077 * Insert onto list for new vnode.
1077 */ 1078 */
1078 bufinsvn(bp, &vp->v_cleanblkhd); 1079 bufinsvn(bp, &vp->v_cleanblkhd);
1079 bp->b_objlock = &vp->v_interlock; 1080 bp->b_objlock = &vp->v_interlock;
1080} 1081}
1081 1082
1082/* 1083/*
1083 * Disassociate a buffer from a vnode. 1084 * Disassociate a buffer from a vnode.
1084 */ 1085 */
1085void 1086void
1086brelvp(struct buf *bp) 1087brelvp(struct buf *bp)
1087{ 1088{
1088 struct vnode *vp = bp->b_vp; 1089 struct vnode *vp = bp->b_vp;
1089 1090
1090 KASSERT(vp != NULL); 1091 KASSERT(vp != NULL);
1091 KASSERT(bp->b_objlock == &vp->v_interlock); 1092 KASSERT(bp->b_objlock == &vp->v_interlock);
1092 KASSERT(mutex_owned(&vp->v_interlock)); 1093 KASSERT(mutex_owned(&vp->v_interlock));
1093 KASSERT(mutex_owned(&bufcache_lock)); 1094 KASSERT(mutex_owned(&bufcache_lock));
1094 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1095 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1095 KASSERT(!cv_has_waiters(&bp->b_done)); 1096 KASSERT(!cv_has_waiters(&bp->b_done));
1096 1097
1097 /* 1098 /*
1098 * Delete from old vnode list, if on one. 1099 * Delete from old vnode list, if on one.
1099 */ 1100 */
1100 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1101 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1101 bufremvn(bp); 1102 bufremvn(bp);
1102 1103
1103 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) && 1104 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) &&
1104 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1105 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1105 vp->v_iflag &= ~VI_WRMAPDIRTY; 1106 vp->v_iflag &= ~VI_WRMAPDIRTY;
1106 vn_syncer_remove_from_worklist(vp); 1107 vn_syncer_remove_from_worklist(vp);
1107 } 1108 }
1108 1109
1109 bp->b_objlock = &buffer_lock; 1110 bp->b_objlock = &buffer_lock;
1110 bp->b_vp = NULL; 1111 bp->b_vp = NULL;
1111 holdrelel(vp); 1112 holdrelel(vp);
1112} 1113}
1113 1114
1114/* 1115/*
1115 * Reassign a buffer from one vnode list to another. 1116 * Reassign a buffer from one vnode list to another.
1116 * The list reassignment must be within the same vnode. 1117 * The list reassignment must be within the same vnode.
1117 * Used to assign file specific control information 1118 * Used to assign file specific control information
1118 * (indirect blocks) to the list to which they belong. 1119 * (indirect blocks) to the list to which they belong.
1119 */ 1120 */
1120void 1121void
1121reassignbuf(struct buf *bp, struct vnode *vp) 1122reassignbuf(struct buf *bp, struct vnode *vp)
1122{ 1123{
1123 struct buflists *listheadp; 1124 struct buflists *listheadp;
1124 int delayx; 1125 int delayx;
1125 1126
1126 KASSERT(mutex_owned(&bufcache_lock)); 1127 KASSERT(mutex_owned(&bufcache_lock));
1127 KASSERT(bp->b_objlock == &vp->v_interlock); 1128 KASSERT(bp->b_objlock == &vp->v_interlock);
1128 KASSERT(mutex_owned(&vp->v_interlock)); 1129 KASSERT(mutex_owned(&vp->v_interlock));
1129 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1130 KASSERT((bp->b_cflags & BC_BUSY) != 0);
1130 1131
1131 /* 1132 /*
1132 * Delete from old vnode list, if on one. 1133 * Delete from old vnode list, if on one.
1133 */ 1134 */
1134 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1135 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
1135 bufremvn(bp); 1136 bufremvn(bp);
1136 1137
1137 /* 1138 /*
1138 * If dirty, put on list of dirty buffers; 1139 * If dirty, put on list of dirty buffers;
1139 * otherwise insert onto list of clean buffers. 1140 * otherwise insert onto list of clean buffers.
1140 */ 1141 */
1141 if ((bp->b_oflags & BO_DELWRI) == 0) { 1142 if ((bp->b_oflags & BO_DELWRI) == 0) {
1142 listheadp = &vp->v_cleanblkhd; 1143 listheadp = &vp->v_cleanblkhd;
1143 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 1144 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1144 (vp->v_iflag & VI_ONWORKLST) && 1145 (vp->v_iflag & VI_ONWORKLST) &&
1145 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1146 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1146 vp->v_iflag &= ~VI_WRMAPDIRTY; 1147 vp->v_iflag &= ~VI_WRMAPDIRTY;
1147 vn_syncer_remove_from_worklist(vp); 1148 vn_syncer_remove_from_worklist(vp);
1148 } 1149 }
1149 } else { 1150 } else {
1150 listheadp = &vp->v_dirtyblkhd; 1151 listheadp = &vp->v_dirtyblkhd;
1151 if ((vp->v_iflag & VI_ONWORKLST) == 0) { 1152 if ((vp->v_iflag & VI_ONWORKLST) == 0) {
1152 switch (vp->v_type) { 1153 switch (vp->v_type) {
1153 case VDIR: 1154 case VDIR:
1154 delayx = dirdelay; 1155 delayx = dirdelay;
1155 break; 1156 break;
1156 case VBLK: 1157 case VBLK:
1157 if (vp->v_specmountpoint != NULL) { 1158 if (vp->v_specmountpoint != NULL) {
1158 delayx = metadelay; 1159 delayx = metadelay;
1159 break; 1160 break;
1160 } 1161 }
1161 /* fall through */ 1162 /* fall through */
1162 default: 1163 default:
1163 delayx = filedelay; 1164 delayx = filedelay;
1164 break; 1165 break;
1165 } 1166 }
1166 if (!vp->v_mount || 1167 if (!vp->v_mount ||
1167 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1168 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
1168 vn_syncer_add_to_worklist(vp, delayx); 1169 vn_syncer_add_to_worklist(vp, delayx);
1169 } 1170 }
1170 } 1171 }
1171 bufinsvn(bp, listheadp); 1172 bufinsvn(bp, listheadp);
1172} 1173}
1173 1174
1174/* 1175/*
1175 * Create a vnode for a device. 1176 * Create a vnode for a device.
1176 * Used by bdevvp (block device) for root file system etc., 1177 * Used by bdevvp (block device) for root file system etc.,
1177 * and by cdevvp (character device) for console and kernfs. 1178 * and by cdevvp (character device) for console and kernfs.
1178 */ 1179 */
1179static int 1180static int
1180getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 1181getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
1181{ 1182{
1182 vnode_t *vp; 1183 vnode_t *vp;
1183 vnode_t *nvp; 1184 vnode_t *nvp;
1184 int error; 1185 int error;
1185 1186
1186 if (dev == NODEV) { 1187 if (dev == NODEV) {
1187 *vpp = NULL; 1188 *vpp = NULL;
1188 return (0); 1189 return (0);
1189 } 1190 }
1190 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1191 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1191 if (error) { 1192 if (error) {
1192 *vpp = NULL; 1193 *vpp = NULL;
1193 return (error); 1194 return (error);
1194 } 1195 }
1195 vp = nvp; 1196 vp = nvp;
1196 vp->v_type = type; 1197 vp->v_type = type;
1197 vp->v_vflag |= VV_MPSAFE; 1198 vp->v_vflag |= VV_MPSAFE;
1198 uvm_vnp_setsize(vp, 0); 1199 uvm_vnp_setsize(vp, 0);
1199 spec_node_init(vp, dev); 1200 spec_node_init(vp, dev);
1200 *vpp = vp; 1201 *vpp = vp;
1201 return (0); 1202 return (0);
1202} 1203}
1203 1204
1204/* 1205/*
1205 * Try to gain a reference to a vnode, without acquiring its interlock. 1206 * Try to gain a reference to a vnode, without acquiring its interlock.
1206 * The caller must hold a lock that will prevent the vnode from being 1207 * The caller must hold a lock that will prevent the vnode from being
1207 * recycled or freed. 1208 * recycled or freed.
1208 */ 1209 */
1209bool 1210bool
1210vtryget(vnode_t *vp) 1211vtryget(vnode_t *vp)
1211{ 1212{
1212 u_int use, next; 1213 u_int use, next;
1213 1214
1214 /* 1215 /*
1215 * If the vnode is being freed, don't make life any harder 1216 * If the vnode is being freed, don't make life any harder
1216 * for vclean() by adding another reference without waiting. 1217 * for vclean() by adding another reference without waiting.
1217 * This is not strictly necessary, but we'll do it anyway. 1218 * This is not strictly necessary, but we'll do it anyway.
1218 */ 1219 */
1219 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) { 1220 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) {
1220 return false; 1221 return false;
1221 } 1222 }
1222 for (use = vp->v_usecount;; use = next) { 1223 for (use = vp->v_usecount;; use = next) {
1223 if (use == 0) {  1224 if (use == 0) {
1224 /* Need interlock held if first reference. */ 1225 /* Need interlock held if first reference. */
1225 return false; 1226 return false;
1226 } 1227 }
1227 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 1228 next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
1228 if (__predict_true(next == use)) { 1229 if (__predict_true(next == use)) {
1229 return true; 1230 return true;
1230 } 1231 }
1231 } 1232 }
1232} 1233}
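vtryget() takes an extra reference without the interlock by retrying a compare-and-swap on v_usecount, and gives up when the count is zero, since the first reference also has to pull the vnode off the freelist and therefore needs the locked path in vget(). A minimal sketch of the same lock-free reference grab with C11 atomics, assuming hypothetical names (obj, refcnt, obj_tryref); an illustration of the pattern, not the kernel primitives:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
            atomic_uint refcnt;     /* counterpart of v_usecount */
    };

    /* Try to take an additional reference without locking.  Fails when
     * the count is zero: the first reference must go through the
     * heavier, locked path. */
    static bool
    obj_tryref(struct obj *o)
    {
            unsigned use = atomic_load(&o->refcnt);

            while (use != 0) {
                    /* Succeeds only if nobody changed the count between
                     * the load and the update; on failure 'use' is
                     * refreshed with the current value and we retry. */
                    if (atomic_compare_exchange_weak(&o->refcnt, &use,
                        use + 1))
                            return true;
            }
            return false;
    }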
1233 1234
1234/* 1235/*
1235 * Grab a particular vnode from the free list, increment its 1236 * Grab a particular vnode from the free list, increment its
1236 * reference count and lock it. If the vnode lock bit is set the 1237 * reference count and lock it. If the vnode lock bit is set the
1237 * vnode is being eliminated in vgone. In that case, we can not 1238 * vnode is being eliminated in vgone. In that case, we can not
1238 * grab the vnode, so the process is awakened when the transition is 1239 * grab the vnode, so the process is awakened when the transition is
1239 * completed, and an error returned to indicate that the vnode is no 1240 * completed, and an error returned to indicate that the vnode is no
1240 * longer usable (possibly having been changed to a new file system type). 1241 * longer usable (possibly having been changed to a new file system type).
1241 */ 1242 */
1242int 1243int
1243vget(vnode_t *vp, int flags) 1244vget(vnode_t *vp, int flags)
1244{ 1245{
1245 int error; 1246 int error;
1246 1247
1247 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1248 KASSERT((vp->v_iflag & VI_MARKER) == 0);
1248 1249
1249 if ((flags & LK_INTERLOCK) == 0) 1250 if ((flags & LK_INTERLOCK) == 0)
1250 mutex_enter(&vp->v_interlock); 1251 mutex_enter(&vp->v_interlock);
1251 1252
1252 /* 1253 /*
1253 * Before adding a reference, we must remove the vnode 1254 * Before adding a reference, we must remove the vnode
1254 * from its freelist. 1255 * from its freelist.
1255 */ 1256 */
1256 if (vp->v_usecount == 0) { 1257 if (vp->v_usecount == 0) {
1257 vremfree(vp); 1258 vremfree(vp);
1258 vp->v_usecount = 1; 1259 vp->v_usecount = 1;
1259 } else { 1260 } else {
1260 atomic_inc_uint(&vp->v_usecount); 1261 atomic_inc_uint(&vp->v_usecount);
1261 } 1262 }
1262 1263
1263 /* 1264 /*
1264 * If the vnode is in the process of being cleaned out for 1265 * If the vnode is in the process of being cleaned out for
1265 * another use, we wait for the cleaning to finish and then 1266 * another use, we wait for the cleaning to finish and then
1266 * return failure. Cleaning is determined by checking if 1267 * return failure. Cleaning is determined by checking if
1267 * the VI_XLOCK or VI_FREEING flags are set. 1268 * the VI_XLOCK or VI_FREEING flags are set.
1268 */ 1269 */
1269 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 1270 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
1270 if ((flags & LK_NOWAIT) != 0) { 1271 if ((flags & LK_NOWAIT) != 0) {
1271 vrelel(vp, 0); 1272 vrelel(vp, 0);
1272 return EBUSY; 1273 return EBUSY;
1273 } 1274 }
1274 vwait(vp, VI_XLOCK | VI_FREEING); 1275 vwait(vp, VI_XLOCK | VI_FREEING);
1275 vrelel(vp, 0); 1276 vrelel(vp, 0);
1276 return ENOENT; 1277 return ENOENT;
1277 } 1278 }
1278 if (flags & LK_TYPE_MASK) { 1279 if (flags & LK_TYPE_MASK) {
1279 error = vn_lock(vp, flags | LK_INTERLOCK); 1280 error = vn_lock(vp, flags | LK_INTERLOCK);
1280 if (error != 0) { 1281 if (error != 0) {
1281 vrele(vp); 1282 vrele(vp);
1282 } 1283 }
1283 return error; 1284 return error;
1284 } 1285 }
1285 mutex_exit(&vp->v_interlock); 1286 mutex_exit(&vp->v_interlock);
1286 return 0; 1287 return 0;
1287} 1288}
1288 1289
1289/* 1290/*
1290 * vput(), just unlock and vrele() 1291 * vput(), just unlock and vrele()
1291 */ 1292 */