Wed May 27 21:42:08 2009 UTC
Pull up following revision(s) (requested by hannken in ticket #781):
	sys/kern/kern_physio.c: revision 1.91
PR kern/39536: bufq-related problem when writing DVD-R and DVD-RW media.
Remove a race in which physio_done() could use memory that had already been freed.
Observed by Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>.


(snj)
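The race being removed sits in physio_done(): once it decrements ps->ps_running, signals ps->ps_cv and drops ps->ps_lock, the thread sleeping in physio_wait() can resume, finish physio(), and kmem_free() the physio_stat. The old code then compared bp against ps->ps_orig_bp after the unlock, reading freed memory. Revision 1.91 evaluates that comparison while the lock is still held. Below is a condensed sketch of the fixed tail of physio_done(), trimmed to the relevant lines rather than the complete, compilable function:

static void
physio_done(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;
	struct physio_stat *ps = bp->b_private;
	bool is_iobuf;

	/* ... unmapping and sanity checks elided ... */

	mutex_enter(&ps->ps_lock);
	/* Decide now, while the lock still guarantees "ps" is alive. */
	is_iobuf = (bp != ps->ps_orig_bp);
	/* ... per-chunk error/EOM accounting elided ... */
	ps->ps_running--;
	cv_signal(&ps->ps_cv);
	mutex_exit(&ps->ps_lock);

	/*
	 * From this point on, physio() may already have woken up,
	 * returned, and kmem_free()d "ps", so "ps" must not be
	 * dereferenced again.  The old code did exactly that by
	 * testing "bp != ps->ps_orig_bp" here, after the unlock.
	 */
	if (is_iobuf)
		putiobuf(bp);
}

The only lines the revision adds are the is_iobuf declaration and the locked assignment; everything else is unchanged context shown for orientation.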
cvs diff -r1.88 -r1.88.4.1 src/sys/kern/kern_physio.c

--- src/sys/kern/kern_physio.c 2008/09/24 08:19:19 1.88
+++ src/sys/kern/kern_physio.c 2009/05/27 21:42:08 1.88.4.1
@@ -1,432 +1,434 @@
-/*	$NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $	*/
+/*	$NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
  */
 
 /*-
  * Copyright (c) 1994 Christopher G. Demetriou
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/once.h>
 #include <sys/workqueue.h>
 #include <sys/kmem.h>
 
 #include <uvm/uvm_extern.h>
 
 ONCE_DECL(physio_initialized);
 struct workqueue *physio_workqueue;
 
 /*
  * The routines implemented in this file are described in:
  *	Leffler, et al.: The Design and Implementation of the 4.3BSD
  *		UNIX Operating System (Addison Welley, 1989)
  * on pages 231-233.
  */
 
 /* #define	PHYSIO_DEBUG */
 #if defined(PHYSIO_DEBUG)
 #define	DPRINTF(a)	printf a
 #else /* defined(PHYSIO_DEBUG) */
 #define	DPRINTF(a)	/* nothing */
 #endif /* defined(PHYSIO_DEBUG) */
 
 struct physio_stat {
 	int ps_running;
 	int ps_error;
 	int ps_failed;
 	off_t ps_endoffset;
 	buf_t *ps_orig_bp;
 	kmutex_t ps_lock;
 	kcondvar_t ps_cv;
 };
 
 static void
 physio_done(struct work *wk, void *dummy)
 {
 	struct buf *bp = (void *)wk;
 	size_t todo = bp->b_bufsize;
 	size_t done = bp->b_bcount - bp->b_resid;
 	struct physio_stat *ps = bp->b_private;
+	bool is_iobuf;
 
 	KASSERT(&bp->b_work == wk);
 	KASSERT(bp->b_bcount <= todo);
 	KASSERT(bp->b_resid <= bp->b_bcount);
 	KASSERT((bp->b_flags & B_PHYS) != 0);
 	KASSERT(dummy == NULL);
 
 	vunmapbuf(bp, todo);
 	uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
 
 	mutex_enter(&ps->ps_lock);
+	is_iobuf = (bp != ps->ps_orig_bp);
 	if (__predict_false(done != todo)) {
 		off_t endoffset = dbtob(bp->b_blkno) + done;
 
 		/*
 		 * we got an error or hit EOM.
 		 *
 		 * we only care about the first one.
 		 * ie. the one at the lowest offset.
 		 */
 
 		KASSERT(ps->ps_endoffset != endoffset);
 		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
 		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
 		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
 		    bp->b_blkno, bp->b_bcount, bp->b_flags));
 
 		if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
 			DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
 			    " -> %" PRIu64 "\n",
 			    __func__, ps,
 			    ps->ps_error, bp->b_error,
 			    ps->ps_endoffset, endoffset));
 
 			ps->ps_endoffset = endoffset;
 			ps->ps_error = bp->b_error;
 		}
 		ps->ps_failed++;
 	} else {
 		KASSERT(bp->b_error == 0);
 	}
 
 	ps->ps_running--;
 	cv_signal(&ps->ps_cv);
 	mutex_exit(&ps->ps_lock);
 
-	if (bp != ps->ps_orig_bp)
+	if (is_iobuf)
 		putiobuf(bp);
 }
 
 static void
 physio_biodone(struct buf *bp)
 {
 #if defined(DIAGNOSTIC)
 	struct physio_stat *ps = bp->b_private;
 	size_t todo = bp->b_bufsize;
 
 	KASSERT(ps->ps_running > 0);
 	KASSERT(bp->b_bcount <= todo);
 	KASSERT(bp->b_resid <= bp->b_bcount);
 #endif /* defined(DIAGNOSTIC) */
 
 	workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
 }
 
 static void
 physio_wait(struct physio_stat *ps, int n)
 {
 
 	KASSERT(mutex_owned(&ps->ps_lock));
 
 	while (ps->ps_running > n)
 		cv_wait(&ps->ps_cv, &ps->ps_lock);
 }
 
 static int
 physio_init(void)
 {
 	int error;
 
 	KASSERT(physio_workqueue == NULL);
 
 	error = workqueue_create(&physio_workqueue, "physiod",
 	    physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
 
 	return error;
 }
 
 #define	PHYSIO_CONCURRENCY	16	/* XXX tune */
 
 /*
  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
  * from the raw device to user buffers, and bypasses the buffer cache.
  *
  * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
  */
 int
 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
     void (*min_phys)(struct buf *), struct uio *uio)
 {
 	struct iovec *iovp;
 	struct lwp *l = curlwp;
 	struct proc *p = l->l_proc;
 	int i, error;
 	struct buf *bp = NULL;
 	struct physio_stat *ps;
 	int concurrency = PHYSIO_CONCURRENCY - 1;
 
 	error = RUN_ONCE(&physio_initialized, physio_init);
 	if (__predict_false(error != 0)) {
 		return error;
 	}
 
 	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
 	    __func__, uio->uio_offset, uio->uio_resid));
 
 	flags &= B_READ | B_WRITE;
 
 	if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
 		return ENOMEM;
 	/* ps->ps_running = 0; */
 	/* ps->ps_error = 0; */
 	/* ps->ps_failed = 0; */
 	ps->ps_orig_bp = obp;
 	ps->ps_endoffset = -1;
 	mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
 	cv_init(&ps->ps_cv, "physio");
 
 	/* Make sure we have a buffer, creating one if necessary. */
 	if (obp != NULL) {
 		/* [raise the processor priority level to splbio;] */
 		mutex_enter(&bufcache_lock);
 		/* Mark it busy, so nobody else will use it. */
 		while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
 			;
 		mutex_exit(&bufcache_lock);
 		concurrency = 0; /* see "XXXkludge" comment below */
 	}
 
 	uvm_lwp_hold(l);
 
 	for (i = 0; i < uio->uio_iovcnt; i++) {
 		bool sync = true;
 
 		iovp = &uio->uio_iov[i];
 		while (iovp->iov_len > 0) {
 			size_t todo;
 			vaddr_t endp;
 
 			mutex_enter(&ps->ps_lock);
 			if (ps->ps_failed != 0) {
 				goto done_locked;
 			}
 			physio_wait(ps, sync ? 0 : concurrency);
 			mutex_exit(&ps->ps_lock);
 			if (obp != NULL) {
 				/*
 				 * XXXkludge
 				 * some drivers use "obp" as an identifier.
 				 */
 				bp = obp;
 			} else {
 				bp = getiobuf(NULL, true);
 				bp->b_cflags = BC_BUSY;
 			}
 			bp->b_dev = dev;
 			bp->b_proc = p;
 			bp->b_private = ps;
 
 			/*
 			 * [mark the buffer busy for physical I/O]
 			 * (i.e. set B_PHYS (because it's an I/O to user
 			 * memory, and B_RAW, because B_RAW is to be
 			 * "Set by physio for raw transfers.", in addition
 			 * to the "busy" and read/write flag.)
 			 */
 			bp->b_oflags = 0;
 			bp->b_cflags = BC_BUSY;
 			bp->b_flags = flags | B_PHYS | B_RAW;
 			bp->b_iodone = physio_biodone;
 
 			/* [set up the buffer for a maximum-sized transfer] */
 			bp->b_blkno = btodb(uio->uio_offset);
 			if (dbtob(bp->b_blkno) != uio->uio_offset) {
 				error = EINVAL;
 				goto done;
 			}
 			bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
 			bp->b_data = iovp->iov_base;
 
 			/*
 			 * [call minphys to bound the transfer size]
 			 * and remember the amount of data to transfer,
 			 * for later comparison.
 			 */
 			(*min_phys)(bp);
 			todo = bp->b_bufsize = bp->b_bcount;
 #if defined(DIAGNOSTIC)
 			if (todo > MAXPHYS)
 				panic("todo(%zu) > MAXPHYS; minphys broken",
 				    todo);
 #endif /* defined(DIAGNOSTIC) */
 
 			sync = false;
 			endp = (vaddr_t)bp->b_data + todo;
 			if (trunc_page(endp) != endp) {
 				/*
 				 * following requests can overlap.
 				 * note that uvm_vslock does round_page.
 				 */
 				sync = true;
 			}
 
 			/*
 			 * [lock the part of the user address space involved
 			 *    in the transfer]
 			 * Beware vmapbuf(); it clobbers b_data and
 			 * saves it in b_saveaddr.  However, vunmapbuf()
 			 * restores it.
 			 */
 			error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
 			    (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
 			if (error) {
 				goto done;
 			}
 			vmapbuf(bp, todo);
 
 			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 
 			mutex_enter(&ps->ps_lock);
 			ps->ps_running++;
 			mutex_exit(&ps->ps_lock);
 
 			/* [call strategy to start the transfer] */
 			(*strategy)(bp);
 			bp = NULL;
 
 			iovp->iov_len -= todo;
 			iovp->iov_base = (char *)iovp->iov_base + todo;
 			uio->uio_offset += todo;
 			uio->uio_resid -= todo;
 		}
 	}
 
 done:
 	mutex_enter(&ps->ps_lock);
 done_locked:
 	physio_wait(ps, 0);
 	mutex_exit(&ps->ps_lock);
 
 	if (ps->ps_failed != 0) {
 		off_t delta;
 
 		delta = uio->uio_offset - ps->ps_endoffset;
 		KASSERT(delta > 0);
 		uio->uio_resid += delta;
 		/* uio->uio_offset = ps->ps_endoffset; */
 	} else {
 		KASSERT(ps->ps_endoffset == -1);
 	}
 	if (bp != NULL && bp != obp) {
 		putiobuf(bp);
 	}
 	if (error == 0) {
 		error = ps->ps_error;
 	}
 	mutex_destroy(&ps->ps_lock);
 	cv_destroy(&ps->ps_cv);
 	kmem_free(ps, sizeof(*ps));
 
 	/*
 	 * [clean up the state of the buffer]
 	 * Remember if somebody wants it, so we can wake them up below.
 	 * Also, if we had to steal it, give it back.
 	 */
 	if (obp != NULL) {
 		KASSERT((obp->b_cflags & BC_BUSY) != 0);
 
 		/*
 		 * [if another process is waiting for the raw I/O buffer,
 		 *    wake up processes waiting to do physical I/O;
 		 */
 		mutex_enter(&bufcache_lock);
 		obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
 		obp->b_flags &= ~(B_PHYS | B_RAW);
 		obp->b_iodone = NULL;
 		cv_broadcast(&obp->b_busy);
 		mutex_exit(&bufcache_lock);
 	}
 	uvm_lwp_rele(l);
 
 	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
 	    __func__, uio->uio_offset, uio->uio_resid));
 
 	return error;
 }
 
 /*
  * Leffler, et al., says on p. 231:
  * "The minphys() routine is called by physio() to adjust the
  *    size of each I/O transfer before the latter is passed to
  *    the strategy routine..."
  *
  * so, just adjust the buffer's count accounting to MAXPHYS here,
  * and return the new count;
  */
 void
 minphys(struct buf *bp)
 {
 
 	if (bp->b_bcount > MAXPHYS)
 		bp->b_bcount = MAXPHYS;
 }