Tue Jul 21 00:31:58 2009 UTC ()
Pull up following revision(s) (requested by rmind in ticket #863):
	sys/sys/vnode.h: revision 1.207
	sys/kern/vfs_subr.c: revision 1.379
put a flag bit into v_usecount to prevent vtryget during getcleanvnode.
this fixes the following deadlock.
        a thread doing getcleanvnode:
        pick a vnode
        acqure v_interlock
        v_usecount++
        call vclean
                now, another thread doing cache_lookup:
                picks the vnode
                vtryget succeed
                vn_lock succeed
        now in vclean:
        set VI_XLOCK (too late to be noticed by the competing thread)
        wait on the vnode lock (this might violate locking order)
the use of a flag bit was suggested by Andrew Doran.  PR/41374.


(snj)
diff -r1.357.4.4 -r1.357.4.5 src/sys/kern/vfs_subr.c
diff -r1.197 -r1.197.4.1 src/sys/sys/vnode.h

cvs diff -r1.357.4.4 -r1.357.4.5 src/sys/kern/vfs_subr.c (expand / switch to unified diff)

--- src/sys/kern/vfs_subr.c 2009/02/16 03:33:16 1.357.4.4
+++ src/sys/kern/vfs_subr.c 2009/07/21 00:31:58 1.357.4.5
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: vfs_subr.c,v 1.357.4.4 2009/02/16 03:33:16 snj Exp $ */ 1/* $NetBSD: vfs_subr.c,v 1.357.4.5 2009/07/21 00:31:58 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This code is derived from software contributed to The NetBSD Foundation 7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 * 10 *
11 * Redistribution and use in source and binary forms, with or without 11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions 12 * modification, are permitted provided that the following conditions
13 * are met: 13 * are met:
14 * 1. Redistributions of source code must retain the above copyright 14 * 1. Redistributions of source code must retain the above copyright
@@ -66,32 +66,42 @@ @@ -66,32 +66,42 @@
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */ 67 */
68 68
69/* 69/*
70 * Note on v_usecount and locking: 70 * Note on v_usecount and locking:
71 * 71 *
72 * At nearly all points it is known that v_usecount could be zero, the 72 * At nearly all points it is known that v_usecount could be zero, the
73 * vnode interlock will be held. 73 * vnode interlock will be held.
74 * 74 *
75 * To change v_usecount away from zero, the interlock must be held. To 75 * To change v_usecount away from zero, the interlock must be held. To
76 * change from a non-zero value to zero, again the interlock must be 76 * change from a non-zero value to zero, again the interlock must be
77 * held. 77 * held.
78 * 78 *
79 * Changing the usecount from a non-zero value to a non-zero value can 79 * There's a flag bit, VC_XLOCK, embedded in v_usecount.
80 * safely be done using atomic operations, without the interlock held. 80 * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock
 81 * must be held.
 82 * To modify the VC_XLOCK bit, the interlock must be held.
 83 * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
 84 * VC_XLOCK bit is set.
 85 *
 86 * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
 87 * value to a non-zero value can safely be done using atomic operations,
 88 * without the interlock held.
 89 * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
 90 * value can be done using atomic operations, without the interlock held.
81 */ 91 */
82 92
83#include <sys/cdefs.h> 93#include <sys/cdefs.h>
84__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.357.4.4 2009/02/16 03:33:16 snj Exp $"); 94__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.357.4.5 2009/07/21 00:31:58 snj Exp $");
85 95
86#include "opt_ddb.h" 96#include "opt_ddb.h"
87#include "opt_compat_netbsd.h" 97#include "opt_compat_netbsd.h"
88#include "opt_compat_43.h" 98#include "opt_compat_43.h"
89 99
90#include <sys/param.h> 100#include <sys/param.h>
91#include <sys/systm.h> 101#include <sys/systm.h>
92#include <sys/proc.h> 102#include <sys/proc.h>
93#include <sys/kernel.h> 103#include <sys/kernel.h>
94#include <sys/mount.h> 104#include <sys/mount.h>
95#include <sys/fcntl.h> 105#include <sys/fcntl.h>
96#include <sys/vnode.h> 106#include <sys/vnode.h>
97#include <sys/stat.h> 107#include <sys/stat.h>
@@ -356,28 +366,30 @@ try_nextlist: @@ -356,28 +366,30 @@ try_nextlist:
356 } 366 }
357 367
358 /* Remove it from the freelist. */ 368 /* Remove it from the freelist. */
359 TAILQ_REMOVE(listhd, vp, v_freelist); 369 TAILQ_REMOVE(listhd, vp, v_freelist);
360 vp->v_freelisthd = NULL; 370 vp->v_freelisthd = NULL;
361 mutex_exit(&vnode_free_list_lock); 371 mutex_exit(&vnode_free_list_lock);
362 372
363 /* 373 /*
364 * The vnode is still associated with a file system, so we must 374 * The vnode is still associated with a file system, so we must
365 * clean it out before reusing it. We need to add a reference 375 * clean it out before reusing it. We need to add a reference
366 * before doing this. If the vnode gains another reference while 376 * before doing this. If the vnode gains another reference while
367 * being cleaned out then we lose - retry. 377 * being cleaned out then we lose - retry.
368 */ 378 */
369 atomic_inc_uint(&vp->v_usecount); 379 atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
370 vclean(vp, DOCLOSE); 380 vclean(vp, DOCLOSE);
 381 KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
 382 atomic_add_int(&vp->v_usecount, -VC_XLOCK);
371 if (vp->v_usecount == 1) { 383 if (vp->v_usecount == 1) {
372 /* We're about to dirty it. */ 384 /* We're about to dirty it. */
373 vp->v_iflag &= ~VI_CLEAN; 385 vp->v_iflag &= ~VI_CLEAN;
374 mutex_exit(&vp->v_interlock); 386 mutex_exit(&vp->v_interlock);
375 if (vp->v_type == VBLK || vp->v_type == VCHR) { 387 if (vp->v_type == VBLK || vp->v_type == VCHR) {
376 spec_node_destroy(vp); 388 spec_node_destroy(vp);
377 } 389 }
378 vp->v_type = VNON; 390 vp->v_type = VNON;
379 } else { 391 } else {
380 /* 392 /*
381 * Don't return to freelist - the holder of the last 393 * Don't return to freelist - the holder of the last
382 * reference will destroy it. 394 * reference will destroy it.
383 */ 395 */
@@ -1211,27 +1223,27 @@ bool @@ -1211,27 +1223,27 @@ bool
1211vtryget(vnode_t *vp) 1223vtryget(vnode_t *vp)
1212{ 1224{
1213 u_int use, next; 1225 u_int use, next;
1214 1226
1215 /* 1227 /*
1216 * If the vnode is being freed, don't make life any harder 1228 * If the vnode is being freed, don't make life any harder
1217 * for vclean() by adding another reference without waiting. 1229 * for vclean() by adding another reference without waiting.
1218 * This is not strictly necessary, but we'll do it anyway. 1230 * This is not strictly necessary, but we'll do it anyway.
1219 */ 1231 */
1220 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) { 1232 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) {
1221 return false; 1233 return false;
1222 } 1234 }
1223 for (use = vp->v_usecount;; use = next) { 1235 for (use = vp->v_usecount;; use = next) {
1224 if (use == 0) {  1236 if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
1225 /* Need interlock held if first reference. */ 1237 /* Need interlock held if first reference. */
1226 return false; 1238 return false;
1227 } 1239 }
1228 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 1240 next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
1229 if (__predict_true(next == use)) { 1241 if (__predict_true(next == use)) {
1230 return true; 1242 return true;
1231 } 1243 }
1232 } 1244 }
1233} 1245}
1234 1246
1235/* 1247/*
1236 * Grab a particular vnode from the free list, increment its 1248 * Grab a particular vnode from the free list, increment its
1237 * reference count and lock it. If the vnode lock bit is set the 1249 * reference count and lock it. If the vnode lock bit is set the
@@ -1300,29 +1312,30 @@ vput(vnode_t *vp) @@ -1300,29 +1312,30 @@ vput(vnode_t *vp)
1300 vrele(vp); 1312 vrele(vp);
1301} 1313}
1302 1314
1303/* 1315/*
1304 * Try to drop reference on a vnode. Abort if we are releasing the 1316 * Try to drop reference on a vnode. Abort if we are releasing the
1305 * last reference. Note: this _must_ succeed if not the last reference. 1317 * last reference. Note: this _must_ succeed if not the last reference.
1306 */ 1318 */
1307static inline bool 1319static inline bool
1308vtryrele(vnode_t *vp) 1320vtryrele(vnode_t *vp)
1309{ 1321{
1310 u_int use, next; 1322 u_int use, next;
1311 1323
1312 for (use = vp->v_usecount;; use = next) { 1324 for (use = vp->v_usecount;; use = next) {
1313 if (use == 1) {  1325 if (use == 1) {
1314 return false; 1326 return false;
1315 } 1327 }
 1328 KASSERT((use & VC_MASK) > 1);
1316 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 1329 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
1317 if (__predict_true(next == use)) { 1330 if (__predict_true(next == use)) {
1318 return true; 1331 return true;
1319 } 1332 }
1320 } 1333 }
1321} 1334}
1322 1335
1323/* 1336/*
1324 * Vnode release. If reference count drops to zero, call inactive 1337 * Vnode release. If reference count drops to zero, call inactive
1325 * routine and either return to freelist or free to the pool. 1338 * routine and either return to freelist or free to the pool.
1326 */ 1339 */
1327void 1340void
1328vrelel(vnode_t *vp, int flags) 1341vrelel(vnode_t *vp, int flags)

cvs diff -r1.197 -r1.197.4.1 src/sys/sys/vnode.h (expand / switch to unified diff)

--- src/sys/sys/vnode.h 2008/07/31 05:38:06 1.197
+++ src/sys/sys/vnode.h 2009/07/21 00:31:58 1.197.4.1
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: vnode.h,v 1.197 2008/07/31 05:38:06 simonb Exp $ */ 1/* $NetBSD: vnode.h,v 1.197.4.1 2009/07/21 00:31:58 snj Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * Redistribution and use in source and binary forms, with or without 7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions 8 * modification, are permitted provided that the following conditions
9 * are met: 9 * are met:
10 * 1. Redistributions of source code must retain the above copyright 10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer. 11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright 12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the 13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution. 14 * documentation and/or other materials provided with the distribution.
@@ -239,26 +239,32 @@ typedef struct vnode vnode_t; @@ -239,26 +239,32 @@ typedef struct vnode vnode_t;
239 */ 239 */
240#define VU_DIROP 0x01000000 /* LFS: involved in a directory op */ 240#define VU_DIROP 0x01000000 /* LFS: involved in a directory op */
241#define VU_SOFTDEP 0x02000000 /* FFS: involved in softdep processing */ 241#define VU_SOFTDEP 0x02000000 /* FFS: involved in softdep processing */
242 242
243#define VNODE_FLAGBITS \ 243#define VNODE_FLAGBITS \
244 "\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\6LOCKSWORK\11TEXT\12EXECMAP" \ 244 "\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\6LOCKSWORK\11TEXT\12EXECMAP" \
245 "\13WRMAP\14WRMAPDIRTY\15XLOCK\17ONWORKLST\20MARKER" \ 245 "\13WRMAP\14WRMAPDIRTY\15XLOCK\17ONWORKLST\20MARKER" \
246 "\22LAYER\24CLEAN\25INACTPEND\26INACTREDO\27FREEING" \ 246 "\22LAYER\24CLEAN\25INACTPEND\26INACTREDO\27FREEING" \
247 "\28INACTNOW\31DIROP\32SOFTDEP"  247 "\28INACTNOW\31DIROP\32SOFTDEP"
248 248
249#define VSIZENOTSET ((voff_t)-1) 249#define VSIZENOTSET ((voff_t)-1)
250 250
251/* 251/*
 252 * v_usecount; see the comment in vfs_subr.c
 253 */
 254#define VC_XLOCK 0x80000000
 255#define VC_MASK 0x7fffffff
 256
 257/*
252 * Vnode attributes. A field value of VNOVAL represents a field whose value 258 * Vnode attributes. A field value of VNOVAL represents a field whose value
253 * is unavailable (getattr) or which is not to be changed (setattr). 259 * is unavailable (getattr) or which is not to be changed (setattr).
254 */ 260 */
255struct vattr { 261struct vattr {
256 enum vtype va_type; /* vnode type (for create) */ 262 enum vtype va_type; /* vnode type (for create) */
257 mode_t va_mode; /* files access mode and type */ 263 mode_t va_mode; /* files access mode and type */
258 nlink_t va_nlink; /* number of references to file */ 264 nlink_t va_nlink; /* number of references to file */
259 uid_t va_uid; /* owner user id */ 265 uid_t va_uid; /* owner user id */
260 gid_t va_gid; /* owner group id */ 266 gid_t va_gid; /* owner group id */
261 long va_fsid; /* file system id (dev for now) */ 267 long va_fsid; /* file system id (dev for now) */
262 ino_t va_fileid; /* file id */ 268 ino_t va_fileid; /* file id */
263 u_quad_t va_size; /* file size in bytes */ 269 u_quad_t va_size; /* file size in bytes */
264 long va_blocksize; /* blocksize preferred for i/o */ 270 long va_blocksize; /* blocksize preferred for i/o */