Mon Feb 21 09:29:21 2011 UTC
Change the snapshot lock:
- No need to take the snapshot lock while the file system is suspended.
- Allow ffs_copyonwrite() one level of recursion with snapshots locked (see the sketch after this list).
- Do the block address lookup with snapshots locked.
- Take the snapshot lock while removing a snapshot from the list.
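The si_snaplock handling behind these points amounts to an owner-tracked lock: the lock remembers the lwp that holds it, and a re-entry from that same lwp (ffs_copyonwrite() called while a snapshot routine already holds the lock) skips the acquire instead of deadlocking. Below is a minimal userland sketch of that idea using POSIX threads; the names (struct snap_lock, snap_enter, snap_exit, sl_meta) are illustrative stand-ins, not the kernel API, and the real code keeps the owner in si_owner guarded by si_lock.

/*
 * Illustrative model only: kmutex_t/si_snaplock/si_owner are approximated
 * with pthreads; snap_enter()/snap_exit() are hypothetical names.
 */
#include <pthread.h>
#include <stdbool.h>

struct snap_lock {
	pthread_mutex_t sl_meta;	/* models si_lock: guards sl_owner */
	pthread_mutex_t sl_mutex;	/* models si_snaplock */
	pthread_t sl_owner;		/* models si_owner */
	bool sl_owned;
};

static struct snap_lock snaplock = {
	.sl_meta = PTHREAD_MUTEX_INITIALIZER,
	.sl_mutex = PTHREAD_MUTEX_INITIALIZER,
	.sl_owned = false,
};

/*
 * Take the lock unless this thread already owns it.  Returns true if this
 * frame took the lock and therefore must release it.
 */
static bool
snap_enter(struct snap_lock *sl)
{
	bool mine;

	pthread_mutex_lock(&sl->sl_meta);
	mine = sl->sl_owned && pthread_equal(sl->sl_owner, pthread_self());
	pthread_mutex_unlock(&sl->sl_meta);
	if (mine)
		return false;		/* one level of recursion: already ours */

	pthread_mutex_lock(&sl->sl_mutex);
	pthread_mutex_lock(&sl->sl_meta);
	sl->sl_owner = pthread_self();
	sl->sl_owned = true;
	pthread_mutex_unlock(&sl->sl_meta);
	return true;
}

static void
snap_exit(struct snap_lock *sl, bool locked_here)
{
	if (!locked_here)
		return;			/* the outer frame will release it */
	pthread_mutex_lock(&sl->sl_meta);
	sl->sl_owned = false;
	pthread_mutex_unlock(&sl->sl_meta);
	pthread_mutex_unlock(&sl->sl_mutex);
}

In the diff below the same check appears as si->si_owner != curlwp in ffs_copyonwrite(); ffs_snapblkfree() and ffs_copyonwrite() record curlwp in si_owner after taking si_snaplock, so a nested ffs_copyonwrite() call from the same lwp does not try to take the lock again.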

While hunting deadlocks, change the transaction scope for ffs_snapremove():
we could deadlock in UFS_WAPBL_BEGIN() while a buffer is held.
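The hazard is sleeping in UFS_WAPBL_BEGIN() while an indirect-block buffer from ffs_balloc() is still held: making journal space may require flushing that very buffer. The ffs_snapremove() hunk below therefore cycles the transaction only after bawrite() has released the buffer, once per indirect block, instead of every wbreak entries inside the inner loop. The fragment below is a shape-only userland sketch of that ordering rule; buffer_lock and journal_space are hypothetical stand-ins, not kernel objects, and the real calls are named only in comments.

/* Shape-only illustration; the two mutexes are hypothetical stand-ins. */
#include <pthread.h>

static pthread_mutex_t buffer_lock = PTHREAD_MUTEX_INITIALIZER;   /* the locked ibp */
static pthread_mutex_t journal_space = PTHREAD_MUTEX_INITIALIZER; /* waiting in UFS_WAPBL_BEGIN() */

/*
 * Old ordering (deadlock-prone): the transaction was cycled while the
 * indirect-block buffer was still held, so UFS_WAPBL_BEGIN() could wait
 * for journal space that can only be freed by flushing that buffer.
 */
static void
old_ordering(void)
{
	pthread_mutex_lock(&buffer_lock);	/* ffs_balloc() returned a locked ibp */
	pthread_mutex_lock(&journal_space);	/* UFS_WAPBL_END()/UFS_WAPBL_BEGIN() mid-loop */
	pthread_mutex_unlock(&journal_space);
	pthread_mutex_unlock(&buffer_lock);	/* bawrite(ibp) */
}

/*
 * New ordering: finish and release the buffer first, then cycle the
 * transaction once per indirect block.
 */
static void
new_ordering(void)
{
	pthread_mutex_lock(&buffer_lock);	/* process the whole indirect block */
	pthread_mutex_unlock(&buffer_lock);	/* bawrite(ibp) drops the buffer */
	pthread_mutex_lock(&journal_space);	/* UFS_WAPBL_END(); UFS_WAPBL_BEGIN() */
	pthread_mutex_unlock(&journal_space);
}

int
main(void)
{
	old_ordering();	/* completes here only because nothing contends in this model */
	new_ordering();
	return 0;
}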


(hannken)
diff -r1.105 -r1.106 src/sys/ufs/ffs/ffs_snapshot.c

--- src/sys/ufs/ffs/ffs_snapshot.c 2011/02/18 14:48:54 1.105
+++ src/sys/ufs/ffs/ffs_snapshot.c 2011/02/21 09:29:21 1.106
@@ -1,14 +1,14 @@
-/*	$NetBSD: ffs_snapshot.c,v 1.105 2011/02/18 14:48:54 bouyer Exp $	*/
+/*	$NetBSD: ffs_snapshot.c,v 1.106 2011/02/21 09:29:21 hannken Exp $	*/
 
 /*
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -28,27 +28,27 @@
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  *
  * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.105 2011/02/18 14:48:54 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.106 2011/02/21 09:29:21 hannken Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/sched.h>
@@ -69,26 +69,27 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufs_bswap.h>
 #include <ufs/ufs/ufs_wapbl.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #include <uvm/uvm.h>
 
 struct snap_info {
 	kmutex_t si_lock;			/* Lock this snapinfo */
 	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
+	lwp_t *si_owner;			/* Sanplock owner */
 	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
 	daddr_t *si_snapblklist;		/* Snapshot block hints list */
 	uint32_t si_gen;			/* Incremented on change */
 };
 
 #if !defined(FFS_NO_SNAPSHOT)
 typedef int (*acctfunc_t)
     (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
 
 static int snapshot_setup(struct mount *, struct vnode *);
 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
 static int snapshot_expunge(struct mount *, struct vnode *,
     struct fs *, daddr_t *, daddr_t **);
@@ -130,26 +131,27 @@ static int snapdebug = 0;
 
 int
 ffs_snapshot_init(struct ufsmount *ump)
 {
 	struct snap_info *si;
 
 	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
 	if (si == NULL)
 		return ENOMEM;
 
 	TAILQ_INIT(&si->si_snapshots);
 	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
 	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
+	si->si_owner = NULL;
 	si->si_gen = 0;
 	si->si_snapblklist = NULL;
 
 	return 0;
 }
 
 void
 ffs_snapshot_fini(struct ufsmount *ump)
 {
 	struct snap_info *si;
 
 	si = ump->um_snapinfo;
 	ump->um_snapinfo = NULL;
@@ -163,27 +165,26 @@ ffs_snapshot_fini(struct ufsmount *ump)
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  * Vnode is locked on entry and return.
  */
 int
 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
 {
 #if defined(FFS_NO_SNAPSHOT)
 	return EOPNOTSUPP;
 }
 #else /* defined(FFS_NO_SNAPSHOT) */
 	bool suspended = false;
-	bool snapshot_locked = false;
 	int error, redo = 0, snaploc;
 	void *sbbuf = NULL;
 	daddr_t *snaplist = NULL, snaplistsize = 0;
 	struct buf *bp, *nbp;
 	struct fs *copy_fs = NULL;
 	struct fs *fs = VFSTOUFS(mp)->um_fs;
 	struct inode *ip = VTOI(vp);
 	struct lwp *l = curlwp;
 	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
 	struct timespec ts;
 	struct timeval starttime;
 #ifdef DEBUG
 	struct timeval endtime;
@@ -259,31 +260,26 @@ ffs_snapshot(struct mount *mp, struct vn
 	 * Create a copy of the superblock and its summary information.
 	 */
 	error = snapshot_copyfs(mp, vp, &sbbuf);
 	copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
 	if (error)
 		goto out;
 	/*
 	 * Expunge unlinked files from our view.
 	 */
 	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
 	if (error)
 		goto out;
 	/*
-	 * Acquire the snapshot lock.
-	 */
-	mutex_enter(&si->si_snaplock);
-	snapshot_locked = true;
-	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	fs->fs_snapinum[snaploc] = ip->i_number;
 
 	mutex_enter(&si->si_lock);
 	if (is_active_snapshot(si, ip))
 		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
 	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
 	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
 		/*
 		 * If this is the first snapshot on this filesystem, put the
 		 * preliminary list in place and establish the cow handler.
@@ -366,28 +362,26 @@ out:
 	}
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 	} else {
 		/*
 		 * As this is the newest list, it is the most inclusive, so
 		 * should replace the previous list.
 		 */
 		si->si_snapblklist = ip->i_snapblklist;
 	}
 	si->si_gen++;
 	mutex_exit(&si->si_lock);
 
-	if (snapshot_locked)
-		mutex_exit(&si->si_snaplock);
 	if (suspended) {
 		vfs_resume(vp->v_mount);
 #ifdef DEBUG
 		getmicrotime(&endtime);
 		timersub(&endtime, &starttime, &endtime);
 		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
 		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
 		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
 #endif
 	}
 	if (error) {
 		if (!UFS_WAPBL_BEGIN(mp)) {
 			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
@@ -1344,101 +1338,102 @@ ffs_snapgone(struct inode *ip)
  * Prepare a snapshot file for being removed.
  */
 void
 ffs_snapremove(struct vnode *vp)
 {
 	struct inode *ip = VTOI(vp), *xp;
 	struct vnode *devvp = ip->i_devvp;
 	struct fs *fs = ip->i_fs;
 	struct mount *mp = devvp->v_specmountpoint;
 	struct buf *ibp;
 	struct snap_info *si;
 	struct lwp *l = curlwp;
 	daddr_t numblks, blkno, dblk;
-	int error, loc, last, n;
-	const int wbreak = blocks_in_journal(fs)/8;
+	int error, loc, last;
 
 	si = VFSTOUFS(mp)->um_snapinfo;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
+	mutex_enter(&si->si_snaplock);
 	mutex_enter(&si->si_lock);
 	if (is_active_snapshot(si, ip)) {
 		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
 		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
 			/* Roll back the list of preallocated blocks. */
 			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
 			si->si_snapblklist = xp->i_snapblklist;
 			si->si_gen++;
 			mutex_exit(&si->si_lock);
+			mutex_exit(&si->si_snaplock);
 		} else {
 			si->si_snapblklist = 0;
 			si->si_gen++;
 			mutex_exit(&si->si_lock);
+			mutex_exit(&si->si_snaplock);
 			fscow_disestablish(mp, ffs_copyonwrite, devvp);
 		}
 		if (ip->i_snapblklist != NULL) {
 			free(ip->i_snapblklist, M_UFSMNT);
 			ip->i_snapblklist = NULL;
 		}
-	} else
+	} else {
 		mutex_exit(&si->si_lock);
+		mutex_exit(&si->si_snaplock);
+	}
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 */
 	for (blkno = 1; blkno < NDADDR; blkno++) {
 		dblk = db_get(ip, blkno);
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			db_assign(ip, blkno, 0);
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 		     ip->i_number))) {
 			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
 			db_assign(ip, blkno, 0);
 		}
 	}
 	numblks = howmany(ip->i_size, fs->fs_bsize);
-	for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
+	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		for (loc = 0; loc < last; loc++) {
-			if (wbreak > 0 && (++n % wbreak) == 0) {
-				UFS_WAPBL_END(mp);
-				error = UFS_WAPBL_BEGIN(mp);
-				if (error)
-					panic("UFS_WAPBL_BEGIN failed");
-			}
			dblk = idb_get(ip, ibp->b_data, loc);
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				idb_assign(ip, ibp->b_data, loc, 0);
 			else if (dblk == blkstofrags(fs, blkno) &&
 			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
 			    fs->fs_bsize, ip->i_number)) {
 				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
 				idb_assign(ip, ibp->b_data, loc, 0);
 			}
 		}
 		bawrite(ibp);
+		UFS_WAPBL_END(mp);
+		error = UFS_WAPBL_BEGIN(mp);
+		KASSERT(error == 0);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_ASSIGN(ip, flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
@@ -1459,45 +1454,38 @@ ffs_snapremove(struct vnode *vp)
 int
 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
     long size, ino_t inum)
 {
 	struct mount *mp = devvp->v_specmountpoint;
 	struct buf *ibp;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	struct snap_info *si;
 	void *saved_data = NULL;
 	daddr_t lbn;
 	daddr_t blkno;
 	uint32_t gen;
-	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
+	int indiroff = 0, error = 0, claimedblk = 0;
 
 	si = VFSTOUFS(mp)->um_snapinfo;
 	lbn = fragstoblks(fs, bno);
+	mutex_enter(&si->si_snaplock);
 	mutex_enter(&si->si_lock);
+	si->si_owner = curlwp;
+
 retry:
 	gen = si->si_gen;
 	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
 		vp = ITOV(ip);
-		if (snapshot_locked == 0) {
-			if (!mutex_tryenter(&si->si_snaplock)) {
-				mutex_exit(&si->si_lock);
-				mutex_enter(&si->si_snaplock);
-				mutex_enter(&si->si_lock);
-			}
-			snapshot_locked = 1;
-			if (gen != si->si_gen)
-				goto retry;
-		}
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < NDADDR) {
 			blkno = db_get(ip, lbn);
 		} else {
 			mutex_exit(&si->si_lock);
 			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
 			if (error) {
 				mutex_enter(&si->si_lock);
 				break;
 			}
@@ -1574,26 +1562,29 @@ retry:
 		} else {
 			idb_assign(ip, ibp->b_data, indiroff, bno);
 			if (ip->i_nlink > 0)
 				bwrite(ibp);
 			else
 				bdwrite(ibp);
 		}
 		DIP_ADD(ip, blocks, btodb(size));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (ip->i_nlink > 0 && mp->mnt_wapbl)
 			error = syncsnap(vp);
 		else
 			error = 0;
+		mutex_enter(&si->si_lock);
+		si->si_owner = NULL;
+		mutex_exit(&si->si_lock);
 		mutex_exit(&si->si_snaplock);
 		return (error == 0);
 	}
 	if (lbn >= NDADDR)
 		brelse(ibp, 0);
 #ifdef DEBUG
 	if (snapdebug)
 		printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
 		    "Copyonremove: snapino ",
 		    (unsigned long long)ip->i_number,
 		    lbn, "for inum", (unsigned long long)inum, size);
 #endif
 	/*
@@ -1613,37 +1604,37 @@ retry:
 				mutex_enter(&si->si_lock);
 				break;
 			}
 		}
 		error = wrsnapblk(vp, saved_data, lbn);
 		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
 			error = syncsnap(vp);
 		mutex_enter(&si->si_lock);
 		if (error)
 			break;
 		if (gen != si->si_gen)
 			goto retry;
 	}
+	si->si_owner = NULL;
 	mutex_exit(&si->si_lock);
+	mutex_exit(&si->si_snaplock);
 	if (saved_data)
 		free(saved_data, M_UFSMNT);
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 */
-	if (snapshot_locked)
-		mutex_exit(&si->si_snaplock);
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  */
 void
 ffs_snapshot_mount(struct mount *mp)
 {
 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 	struct fs *fs = VFSTOUFS(mp)->um_fs;
 	struct lwp *l = curlwp;
 	struct vnode *vp;
@@ -1836,26 +1827,35 @@ ffs_copyonwrite(void *v, struct buf *bp,
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	if (lower <= upper) {
 		mutex_exit(&si->si_lock);
 		return 0;
 	}
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 */
+	if (si->si_owner != curlwp) {
+		if (!mutex_tryenter(&si->si_snaplock)) {
+			mutex_exit(&si->si_lock);
+			mutex_enter(&si->si_snaplock);
+			mutex_enter(&si->si_lock);
+		}
+		si->si_owner = curlwp;
+		snapshot_locked = 1;
+	}
 	if (data_valid && bp->b_bcount == fs->fs_bsize)
 		saved_data = bp->b_data;
 retry:
 	gen = si->si_gen;
 	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in ffs_balloc.
 		 */
 		if (bp->b_vp == vp)
@@ -1876,54 +1876,28 @@ retry:
 			goto retry;
 		}
 #ifdef DIAGNOSTIC
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		if (blkno != 0)
 			continue;
 
 		if (curlwp == uvm.pagedaemon_lwp) {
 			error = ENOMEM;
 			break;
 		}
-
-		if (snapshot_locked == 0) {
-			if (!mutex_tryenter(&si->si_snaplock)) {
-				mutex_exit(&si->si_lock);
-				mutex_enter(&si->si_snaplock);
-				mutex_enter(&si->si_lock);
-			}
-			snapshot_locked = 1;
-			if (gen != si->si_gen)
-				goto retry;
-
-			/* Check again if block still needs to be copied */
-			if (lbn < NDADDR) {
-				blkno = db_get(ip, lbn);
-			} else {
-				mutex_exit(&si->si_lock);
-				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
-					mutex_enter(&si->si_lock);
-					break;
-				}
-				mutex_enter(&si->si_lock);
-				if (gen != si->si_gen)
-					goto retry;
-			}
-
-			if (blkno != 0)
-				continue;
-		}
+		/* Only one level of recursion allowed. */
+		KASSERT(snapshot_locked);
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
@@ -1958,31 +1932,34 @@ retry:
 		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
 			error = syncsnap(vp);
 		mutex_enter(&si->si_lock);
 		if (error)
 			break;
 		if (gen != si->si_gen)
 			goto retry;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
-	mutex_exit(&si->si_lock);
+	if (snapshot_locked) {
+		si->si_owner = NULL;
+		mutex_exit(&si->si_lock);
+		mutex_exit(&si->si_snaplock);
+	} else
+		mutex_exit(&si->si_lock);
 	if (saved_data && saved_data != bp->b_data)
 		free(saved_data, M_UFSMNT);
-	if (snapshot_locked)
-		mutex_exit(&si->si_snaplock);
 	return error;
 }
 
 /*
  * Read from a snapshot.
  */
 int
 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
 {
 	struct inode *ip = VTOI(vp);
 	struct fs *fs = ip->i_fs;
 	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
 	struct buf *bp;