Wed Sep 24 08:19:19 2008 UTC ()
Resolve a race when physio_done signals completion before it tries to
free a buffer.  This will fail if the buffer owner has a chance to
modify the BC_DONTFREE flag before putphysbuf() examines it.

Fix by removing get/putphysbuf() and BC_DONTFREE.  Physio_done() now
has an explicit test for a buffer coming from the call of physio().

Observed by Lars Nordlund when writing a DVD with growisofs, see PR kern/39536.

Reviewed by: Jason Thorpe <thorpej@netbsd.org>


(hannken)
diff -r1.87 -r1.88 src/sys/kern/kern_physio.c

cvs diff -r1.87 -r1.88 src/sys/kern/kern_physio.c (expand / switch to unified diff)

--- src/sys/kern/kern_physio.c 2008/02/15 13:46:04 1.87
+++ src/sys/kern/kern_physio.c 2008/09/24 08:19:19 1.88
@@ -1,14 +1,14 @@ @@ -1,14 +1,14 @@
1/* $NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $ */ 1/* $NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $ */
2 2
3/*- 3/*-
4 * Copyright (c) 1982, 1986, 1990, 1993 4 * Copyright (c) 1982, 1986, 1990, 1993
5 * The Regents of the University of California. All rights reserved. 5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc. 6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed 7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph 8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc. 10 * the permission of UNIX System Laboratories, Inc.
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
@@ -61,103 +61,65 @@ @@ -61,103 +61,65 @@
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE. 68 * SUCH DAMAGE.
69 * 69 *
70 * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 70 * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
71 */ 71 */
72 72
73#include <sys/cdefs.h> 73#include <sys/cdefs.h>
74__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $"); 74__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $");
75 75
76#include <sys/param.h> 76#include <sys/param.h>
77#include <sys/systm.h> 77#include <sys/systm.h>
78#include <sys/buf.h> 78#include <sys/buf.h>
79#include <sys/proc.h> 79#include <sys/proc.h>
80#include <sys/once.h> 80#include <sys/once.h>
81#include <sys/workqueue.h> 81#include <sys/workqueue.h>
82#include <sys/kmem.h> 82#include <sys/kmem.h>
83 83
84#include <uvm/uvm_extern.h> 84#include <uvm/uvm_extern.h>
85 85
86ONCE_DECL(physio_initialized); 86ONCE_DECL(physio_initialized);
87struct workqueue *physio_workqueue; 87struct workqueue *physio_workqueue;
88 88
89/* 89/*
90 * The routines implemented in this file are described in: 90 * The routines implemented in this file are described in:
91 * Leffler, et al.: The Design and Implementation of the 4.3BSD 91 * Leffler, et al.: The Design and Implementation of the 4.3BSD
92 * UNIX Operating System (Addison Welley, 1989) 92 * UNIX Operating System (Addison Welley, 1989)
93 * on pages 231-233. 93 * on pages 231-233.
94 * 
95 * The routines "getphysbuf" and "putphysbuf" steal and return a swap 
96 * buffer. Leffler, et al., says that swap buffers are used to do the 
97 * I/O, so raw I/O requests don't have to be single-threaded. Of course, 
98 * NetBSD doesn't use "swap buffers" -- we have our own memory pool for 
99 * buffer descriptors. 
100 */ 94 */
101 95
102/* #define PHYSIO_DEBUG */ 96/* #define PHYSIO_DEBUG */
103#if defined(PHYSIO_DEBUG) 97#if defined(PHYSIO_DEBUG)
104#define DPRINTF(a) printf a 98#define DPRINTF(a) printf a
105#else /* defined(PHYSIO_DEBUG) */ 99#else /* defined(PHYSIO_DEBUG) */
106#define DPRINTF(a) /* nothing */ 100#define DPRINTF(a) /* nothing */
107#endif /* defined(PHYSIO_DEBUG) */ 101#endif /* defined(PHYSIO_DEBUG) */
108 102
109struct physio_stat { 103struct physio_stat {
110 int ps_running; 104 int ps_running;
111 int ps_error; 105 int ps_error;
112 int ps_failed; 106 int ps_failed;
113 off_t ps_endoffset; 107 off_t ps_endoffset;
 108 buf_t *ps_orig_bp;
114 kmutex_t ps_lock; 109 kmutex_t ps_lock;
115 kcondvar_t ps_cv; 110 kcondvar_t ps_cv;
116}; 111};
117 112
118/* abuse these flags of struct buf */ 
119#define BC_DONTFREE BC_AGE 
120 
121/* 
122 * allocate a buffer structure for use in physical I/O. 
123 */ 
124static struct buf * 
125getphysbuf(void) 
126{ 
127 struct buf *bp; 
128 
129 bp = getiobuf(NULL, true); 
130 bp->b_error = 0; 
131 bp->b_cflags = BC_BUSY; 
132 return(bp); 
133} 
134 
135/* 
136 * get rid of a swap buffer structure which has been used in physical I/O. 
137 */ 
138static void 
139putphysbuf(struct buf *bp) 
140{ 
141 
142 if ((bp->b_cflags & BC_DONTFREE) != 0) { 
143 return; 
144 } 
145 
146 if (__predict_false(bp->b_cflags & BC_WANTED)) 
147 panic("putphysbuf: private buf BC_WANTED"); 
148 putiobuf(bp); 
149} 
150 
151static void 113static void
152physio_done(struct work *wk, void *dummy) 114physio_done(struct work *wk, void *dummy)
153{ 115{
154 struct buf *bp = (void *)wk; 116 struct buf *bp = (void *)wk;
155 size_t todo = bp->b_bufsize; 117 size_t todo = bp->b_bufsize;
156 size_t done = bp->b_bcount - bp->b_resid; 118 size_t done = bp->b_bcount - bp->b_resid;
157 struct physio_stat *ps = bp->b_private; 119 struct physio_stat *ps = bp->b_private;
158 120
159 KASSERT(&bp->b_work == wk); 121 KASSERT(&bp->b_work == wk);
160 KASSERT(bp->b_bcount <= todo); 122 KASSERT(bp->b_bcount <= todo);
161 KASSERT(bp->b_resid <= bp->b_bcount); 123 KASSERT(bp->b_resid <= bp->b_bcount);
162 KASSERT((bp->b_flags & B_PHYS) != 0); 124 KASSERT((bp->b_flags & B_PHYS) != 0);
163 KASSERT(dummy == NULL); 125 KASSERT(dummy == NULL);
@@ -191,27 +153,28 @@ physio_done(struct work *wk, void *dummy @@ -191,27 +153,28 @@ physio_done(struct work *wk, void *dummy
191 153
192 ps->ps_endoffset = endoffset; 154 ps->ps_endoffset = endoffset;
193 ps->ps_error = bp->b_error; 155 ps->ps_error = bp->b_error;
194 } 156 }
195 ps->ps_failed++; 157 ps->ps_failed++;
196 } else { 158 } else {
197 KASSERT(bp->b_error == 0); 159 KASSERT(bp->b_error == 0);
198 } 160 }
199 161
200 ps->ps_running--; 162 ps->ps_running--;
201 cv_signal(&ps->ps_cv); 163 cv_signal(&ps->ps_cv);
202 mutex_exit(&ps->ps_lock); 164 mutex_exit(&ps->ps_lock);
203 165
204 putphysbuf(bp); 166 if (bp != ps->ps_orig_bp)
 167 putiobuf(bp);
205} 168}
206 169
207static void 170static void
208physio_biodone(struct buf *bp) 171physio_biodone(struct buf *bp)
209{ 172{
210#if defined(DIAGNOSTIC) 173#if defined(DIAGNOSTIC)
211 struct physio_stat *ps = bp->b_private; 174 struct physio_stat *ps = bp->b_private;
212 size_t todo = bp->b_bufsize; 175 size_t todo = bp->b_bufsize;
213 176
214 KASSERT(ps->ps_running > 0); 177 KASSERT(ps->ps_running > 0);
215 KASSERT(bp->b_bcount <= todo); 178 KASSERT(bp->b_bcount <= todo);
216 KASSERT(bp->b_resid <= bp->b_bcount); 179 KASSERT(bp->b_resid <= bp->b_bcount);
217#endif /* defined(DIAGNOSTIC) */ 180#endif /* defined(DIAGNOSTIC) */
@@ -267,38 +230,38 @@ physio(void (*strategy)(struct buf *), s @@ -267,38 +230,38 @@ physio(void (*strategy)(struct buf *), s
267 return error; 230 return error;
268 } 231 }
269 232
270 DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", 233 DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
271 __func__, uio->uio_offset, uio->uio_resid)); 234 __func__, uio->uio_offset, uio->uio_resid));
272 235
273 flags &= B_READ | B_WRITE; 236 flags &= B_READ | B_WRITE;
274 237
275 if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) 238 if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
276 return ENOMEM; 239 return ENOMEM;
277 /* ps->ps_running = 0; */ 240 /* ps->ps_running = 0; */
278 /* ps->ps_error = 0; */ 241 /* ps->ps_error = 0; */
279 /* ps->ps_failed = 0; */ 242 /* ps->ps_failed = 0; */
 243 ps->ps_orig_bp = obp;
280 ps->ps_endoffset = -1; 244 ps->ps_endoffset = -1;
281 mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); 245 mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
282 cv_init(&ps->ps_cv, "physio"); 246 cv_init(&ps->ps_cv, "physio");
283 247
284 /* Make sure we have a buffer, creating one if necessary. */ 248 /* Make sure we have a buffer, creating one if necessary. */
285 if (obp != NULL) { 249 if (obp != NULL) {
286 /* [raise the processor priority level to splbio;] */ 250 /* [raise the processor priority level to splbio;] */
287 mutex_enter(&bufcache_lock); 251 mutex_enter(&bufcache_lock);
 252 /* Mark it busy, so nobody else will use it. */
288 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) 253 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
289 ; 254 ;
290 /* Mark it busy, so nobody else will use it. */ 
291 obp->b_cflags |= BC_DONTFREE; 
292 mutex_exit(&bufcache_lock); 255 mutex_exit(&bufcache_lock);
293 concurrency = 0; /* see "XXXkludge" comment below */ 256 concurrency = 0; /* see "XXXkludge" comment below */
294 } 257 }
295 258
296 uvm_lwp_hold(l); 259 uvm_lwp_hold(l);
297 260
298 for (i = 0; i < uio->uio_iovcnt; i++) { 261 for (i = 0; i < uio->uio_iovcnt; i++) {
299 bool sync = true; 262 bool sync = true;
300 263
301 iovp = &uio->uio_iov[i]; 264 iovp = &uio->uio_iov[i];
302 while (iovp->iov_len > 0) { 265 while (iovp->iov_len > 0) {
303 size_t todo; 266 size_t todo;
304 vaddr_t endp; 267 vaddr_t endp;
@@ -306,41 +269,42 @@ physio(void (*strategy)(struct buf *), s @@ -306,41 +269,42 @@ physio(void (*strategy)(struct buf *), s
306 mutex_enter(&ps->ps_lock); 269 mutex_enter(&ps->ps_lock);
307 if (ps->ps_failed != 0) { 270 if (ps->ps_failed != 0) {
308 goto done_locked; 271 goto done_locked;
309 } 272 }
310 physio_wait(ps, sync ? 0 : concurrency); 273 physio_wait(ps, sync ? 0 : concurrency);
311 mutex_exit(&ps->ps_lock); 274 mutex_exit(&ps->ps_lock);
312 if (obp != NULL) { 275 if (obp != NULL) {
313 /* 276 /*
314 * XXXkludge 277 * XXXkludge
315 * some drivers use "obp" as an identifier. 278 * some drivers use "obp" as an identifier.
316 */ 279 */
317 bp = obp; 280 bp = obp;
318 } else { 281 } else {
319 bp = getphysbuf(); 282 bp = getiobuf(NULL, true);
 283 bp->b_cflags = BC_BUSY;
320 } 284 }
321 bp->b_dev = dev; 285 bp->b_dev = dev;
322 bp->b_proc = p; 286 bp->b_proc = p;
323 bp->b_private = ps; 287 bp->b_private = ps;
324 288
325 /* 289 /*
326 * [mark the buffer busy for physical I/O] 290 * [mark the buffer busy for physical I/O]
327 * (i.e. set B_PHYS (because it's an I/O to user 291 * (i.e. set B_PHYS (because it's an I/O to user
328 * memory, and B_RAW, because B_RAW is to be 292 * memory, and B_RAW, because B_RAW is to be
329 * "Set by physio for raw transfers.", in addition 293 * "Set by physio for raw transfers.", in addition
330 * to the "busy" and read/write flag.) 294 * to the "busy" and read/write flag.)
331 */ 295 */
332 bp->b_oflags = 0; 296 bp->b_oflags = 0;
333 bp->b_cflags = (bp->b_cflags & BC_DONTFREE) | BC_BUSY; 297 bp->b_cflags = BC_BUSY;
334 bp->b_flags = flags | B_PHYS | B_RAW; 298 bp->b_flags = flags | B_PHYS | B_RAW;
335 bp->b_iodone = physio_biodone; 299 bp->b_iodone = physio_biodone;
336 300
337 /* [set up the buffer for a maximum-sized transfer] */ 301 /* [set up the buffer for a maximum-sized transfer] */
338 bp->b_blkno = btodb(uio->uio_offset); 302 bp->b_blkno = btodb(uio->uio_offset);
339 if (dbtob(bp->b_blkno) != uio->uio_offset) { 303 if (dbtob(bp->b_blkno) != uio->uio_offset) {
340 error = EINVAL; 304 error = EINVAL;
341 goto done; 305 goto done;
342 } 306 }
343 bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); 307 bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
344 bp->b_data = iovp->iov_base; 308 bp->b_data = iovp->iov_base;
345 309
346 /* 310 /*
@@ -403,51 +367,50 @@ done_locked: @@ -403,51 +367,50 @@ done_locked:
403 physio_wait(ps, 0); 367 physio_wait(ps, 0);
404 mutex_exit(&ps->ps_lock); 368 mutex_exit(&ps->ps_lock);
405 369
406 if (ps->ps_failed != 0) { 370 if (ps->ps_failed != 0) {
407 off_t delta; 371 off_t delta;
408 372
409 delta = uio->uio_offset - ps->ps_endoffset; 373 delta = uio->uio_offset - ps->ps_endoffset;
410 KASSERT(delta > 0); 374 KASSERT(delta > 0);
411 uio->uio_resid += delta; 375 uio->uio_resid += delta;
412 /* uio->uio_offset = ps->ps_endoffset; */ 376 /* uio->uio_offset = ps->ps_endoffset; */
413 } else { 377 } else {
414 KASSERT(ps->ps_endoffset == -1); 378 KASSERT(ps->ps_endoffset == -1);
415 } 379 }
416 if (bp != NULL) { 380 if (bp != NULL && bp != obp) {
417 putphysbuf(bp); 381 putiobuf(bp);
418 } 382 }
419 if (error == 0) { 383 if (error == 0) {
420 error = ps->ps_error; 384 error = ps->ps_error;
421 } 385 }
422 mutex_destroy(&ps->ps_lock); 386 mutex_destroy(&ps->ps_lock);
423 cv_destroy(&ps->ps_cv); 387 cv_destroy(&ps->ps_cv);
424 kmem_free(ps, sizeof(*ps)); 388 kmem_free(ps, sizeof(*ps));
425 389
426 /* 390 /*
427 * [clean up the state of the buffer] 391 * [clean up the state of the buffer]
428 * Remember if somebody wants it, so we can wake them up below. 392 * Remember if somebody wants it, so we can wake them up below.
429 * Also, if we had to steal it, give it back. 393 * Also, if we had to steal it, give it back.
430 */ 394 */
431 if (obp != NULL) { 395 if (obp != NULL) {
432 KASSERT((obp->b_cflags & BC_BUSY) != 0); 396 KASSERT((obp->b_cflags & BC_BUSY) != 0);
433 KASSERT((obp->b_cflags & BC_DONTFREE) != 0); 
434 397
435 /* 398 /*
436 * [if another process is waiting for the raw I/O buffer, 399 * [if another process is waiting for the raw I/O buffer,
437 * wake up processes waiting to do physical I/O; 400 * wake up processes waiting to do physical I/O;
438 */ 401 */
439 mutex_enter(&bufcache_lock); 402 mutex_enter(&bufcache_lock);
440 obp->b_cflags &= ~(BC_DONTFREE | BC_BUSY | BC_WANTED); 403 obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
441 obp->b_flags &= ~(B_PHYS | B_RAW); 404 obp->b_flags &= ~(B_PHYS | B_RAW);
442 obp->b_iodone = NULL; 405 obp->b_iodone = NULL;
443 cv_broadcast(&obp->b_busy); 406 cv_broadcast(&obp->b_busy);
444 mutex_exit(&bufcache_lock); 407 mutex_exit(&bufcache_lock);
445 } 408 }
446 uvm_lwp_rele(l); 409 uvm_lwp_rele(l);
447 410
448 DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", 411 DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
449 __func__, uio->uio_offset, uio->uio_resid)); 412 __func__, uio->uio_offset, uio->uio_resid));
450 413
451 return error; 414 return error;
452} 415}
453 416