| @@ -1,14 +1,14 @@ | | | @@ -1,14 +1,14 @@ |
1 | /* $NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $ */ | | 1 | /* $NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 1982, 1986, 1990, 1993 | | 4 | * Copyright (c) 1982, 1986, 1990, 1993 |
5 | * The Regents of the University of California. All rights reserved. | | 5 | * The Regents of the University of California. All rights reserved. |
6 | * (c) UNIX System Laboratories, Inc. | | 6 | * (c) UNIX System Laboratories, Inc. |
7 | * All or some portions of this file are derived from material licensed | | 7 | * All or some portions of this file are derived from material licensed |
8 | * to the University of California by American Telephone and Telegraph | | 8 | * to the University of California by American Telephone and Telegraph |
9 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | | 9 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
10 | * the permission of UNIX System Laboratories, Inc. | | 10 | * the permission of UNIX System Laboratories, Inc. |
11 | * | | 11 | * |
12 | * Redistribution and use in source and binary forms, with or without | | 12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions | | 13 | * modification, are permitted provided that the following conditions |
14 | * are met: | | 14 | * are met: |
| @@ -61,103 +61,65 @@ | | | @@ -61,103 +61,65 @@ |
61 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 61 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
62 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 62 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
63 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 63 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
64 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 64 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
65 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 65 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
66 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 66 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
67 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 67 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
68 | * SUCH DAMAGE. | | 68 | * SUCH DAMAGE. |
69 | * | | 69 | * |
70 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 | | 70 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 |
71 | */ | | 71 | */ |
72 | | | 72 | |
73 | #include <sys/cdefs.h> | | 73 | #include <sys/cdefs.h> |
74 | __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.87 2008/02/15 13:46:04 ad Exp $"); | | 74 | __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $"); |
75 | | | 75 | |
76 | #include <sys/param.h> | | 76 | #include <sys/param.h> |
77 | #include <sys/systm.h> | | 77 | #include <sys/systm.h> |
78 | #include <sys/buf.h> | | 78 | #include <sys/buf.h> |
79 | #include <sys/proc.h> | | 79 | #include <sys/proc.h> |
80 | #include <sys/once.h> | | 80 | #include <sys/once.h> |
81 | #include <sys/workqueue.h> | | 81 | #include <sys/workqueue.h> |
82 | #include <sys/kmem.h> | | 82 | #include <sys/kmem.h> |
83 | | | 83 | |
84 | #include <uvm/uvm_extern.h> | | 84 | #include <uvm/uvm_extern.h> |
85 | | | 85 | |
86 | ONCE_DECL(physio_initialized); | | 86 | ONCE_DECL(physio_initialized); |
87 | struct workqueue *physio_workqueue; | | 87 | struct workqueue *physio_workqueue; |
88 | | | 88 | |
89 | /* | | 89 | /* |
90 | * The routines implemented in this file are described in: | | 90 | * The routines implemented in this file are described in: |
91 | * Leffler, et al.: The Design and Implementation of the 4.3BSD | | 91 | * Leffler, et al.: The Design and Implementation of the 4.3BSD |
92 | * UNIX Operating System (Addison Welley, 1989) | | 92 | * UNIX Operating System (Addison Welley, 1989) |
93 | * on pages 231-233. | | 93 | * on pages 231-233. |
94 | * | | | |
95 | * The routines "getphysbuf" and "putphysbuf" steal and return a swap | | | |
96 | * buffer. Leffler, et al., says that swap buffers are used to do the | | | |
97 | * I/O, so raw I/O requests don't have to be single-threaded. Of course, | | | |
98 | * NetBSD doesn't use "swap buffers" -- we have our own memory pool for | | | |
99 | * buffer descriptors. | | | |
100 | */ | | 94 | */ |
101 | | | 95 | |
102 | /* #define PHYSIO_DEBUG */ | | 96 | /* #define PHYSIO_DEBUG */ |
103 | #if defined(PHYSIO_DEBUG) | | 97 | #if defined(PHYSIO_DEBUG) |
104 | #define DPRINTF(a) printf a | | 98 | #define DPRINTF(a) printf a |
105 | #else /* defined(PHYSIO_DEBUG) */ | | 99 | #else /* defined(PHYSIO_DEBUG) */ |
106 | #define DPRINTF(a) /* nothing */ | | 100 | #define DPRINTF(a) /* nothing */ |
107 | #endif /* defined(PHYSIO_DEBUG) */ | | 101 | #endif /* defined(PHYSIO_DEBUG) */ |
108 | | | 102 | |
109 | struct physio_stat { | | 103 | struct physio_stat { |
110 | int ps_running; | | 104 | int ps_running; |
111 | int ps_error; | | 105 | int ps_error; |
112 | int ps_failed; | | 106 | int ps_failed; |
113 | off_t ps_endoffset; | | 107 | off_t ps_endoffset; |
| | | 108 | buf_t *ps_orig_bp; |
114 | kmutex_t ps_lock; | | 109 | kmutex_t ps_lock; |
115 | kcondvar_t ps_cv; | | 110 | kcondvar_t ps_cv; |
116 | }; | | 111 | }; |
117 | | | 112 | |
118 | /* abuse these flags of struct buf */ | | | |
119 | #define BC_DONTFREE BC_AGE | | | |
120 | | | | |
121 | /* | | | |
122 | * allocate a buffer structure for use in physical I/O. | | | |
123 | */ | | | |
124 | static struct buf * | | | |
125 | getphysbuf(void) | | | |
126 | { | | | |
127 | struct buf *bp; | | | |
128 | | | | |
129 | bp = getiobuf(NULL, true); | | | |
130 | bp->b_error = 0; | | | |
131 | bp->b_cflags = BC_BUSY; | | | |
132 | return(bp); | | | |
133 | } | | | |
134 | | | | |
135 | /* | | | |
136 | * get rid of a swap buffer structure which has been used in physical I/O. | | | |
137 | */ | | | |
138 | static void | | | |
139 | putphysbuf(struct buf *bp) | | | |
140 | { | | | |
141 | | | | |
142 | if ((bp->b_cflags & BC_DONTFREE) != 0) { | | | |
143 | return; | | | |
144 | } | | | |
145 | | | | |
146 | if (__predict_false(bp->b_cflags & BC_WANTED)) | | | |
147 | panic("putphysbuf: private buf BC_WANTED"); | | | |
148 | putiobuf(bp); | | | |
149 | } | | | |
150 | | | | |
151 | static void | | 113 | static void |
152 | physio_done(struct work *wk, void *dummy) | | 114 | physio_done(struct work *wk, void *dummy) |
153 | { | | 115 | { |
154 | struct buf *bp = (void *)wk; | | 116 | struct buf *bp = (void *)wk; |
155 | size_t todo = bp->b_bufsize; | | 117 | size_t todo = bp->b_bufsize; |
156 | size_t done = bp->b_bcount - bp->b_resid; | | 118 | size_t done = bp->b_bcount - bp->b_resid; |
157 | struct physio_stat *ps = bp->b_private; | | 119 | struct physio_stat *ps = bp->b_private; |
158 | | | 120 | |
159 | KASSERT(&bp->b_work == wk); | | 121 | KASSERT(&bp->b_work == wk); |
160 | KASSERT(bp->b_bcount <= todo); | | 122 | KASSERT(bp->b_bcount <= todo); |
161 | KASSERT(bp->b_resid <= bp->b_bcount); | | 123 | KASSERT(bp->b_resid <= bp->b_bcount); |
162 | KASSERT((bp->b_flags & B_PHYS) != 0); | | 124 | KASSERT((bp->b_flags & B_PHYS) != 0); |
163 | KASSERT(dummy == NULL); | | 125 | KASSERT(dummy == NULL); |
| @@ -191,27 +153,28 @@ physio_done(struct work *wk, void *dummy | | | @@ -191,27 +153,28 @@ physio_done(struct work *wk, void *dummy |
191 | | | 153 | |
192 | ps->ps_endoffset = endoffset; | | 154 | ps->ps_endoffset = endoffset; |
193 | ps->ps_error = bp->b_error; | | 155 | ps->ps_error = bp->b_error; |
194 | } | | 156 | } |
195 | ps->ps_failed++; | | 157 | ps->ps_failed++; |
196 | } else { | | 158 | } else { |
197 | KASSERT(bp->b_error == 0); | | 159 | KASSERT(bp->b_error == 0); |
198 | } | | 160 | } |
199 | | | 161 | |
200 | ps->ps_running--; | | 162 | ps->ps_running--; |
201 | cv_signal(&ps->ps_cv); | | 163 | cv_signal(&ps->ps_cv); |
202 | mutex_exit(&ps->ps_lock); | | 164 | mutex_exit(&ps->ps_lock); |
203 | | | 165 | |
204 | putphysbuf(bp); | | 166 | if (bp != ps->ps_orig_bp) |
| | | 167 | putiobuf(bp); |
205 | } | | 168 | } |
206 | | | 169 | |
207 | static void | | 170 | static void |
208 | physio_biodone(struct buf *bp) | | 171 | physio_biodone(struct buf *bp) |
209 | { | | 172 | { |
210 | #if defined(DIAGNOSTIC) | | 173 | #if defined(DIAGNOSTIC) |
211 | struct physio_stat *ps = bp->b_private; | | 174 | struct physio_stat *ps = bp->b_private; |
212 | size_t todo = bp->b_bufsize; | | 175 | size_t todo = bp->b_bufsize; |
213 | | | 176 | |
214 | KASSERT(ps->ps_running > 0); | | 177 | KASSERT(ps->ps_running > 0); |
215 | KASSERT(bp->b_bcount <= todo); | | 178 | KASSERT(bp->b_bcount <= todo); |
216 | KASSERT(bp->b_resid <= bp->b_bcount); | | 179 | KASSERT(bp->b_resid <= bp->b_bcount); |
217 | #endif /* defined(DIAGNOSTIC) */ | | 180 | #endif /* defined(DIAGNOSTIC) */ |
| @@ -267,38 +230,38 @@ physio(void (*strategy)(struct buf *), s | | | @@ -267,38 +230,38 @@ physio(void (*strategy)(struct buf *), s |
267 | return error; | | 230 | return error; |
268 | } | | 231 | } |
269 | | | 232 | |
270 | DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", | | 233 | DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", |
271 | __func__, uio->uio_offset, uio->uio_resid)); | | 234 | __func__, uio->uio_offset, uio->uio_resid)); |
272 | | | 235 | |
273 | flags &= B_READ | B_WRITE; | | 236 | flags &= B_READ | B_WRITE; |
274 | | | 237 | |
275 | if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) | | 238 | if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) |
276 | return ENOMEM; | | 239 | return ENOMEM; |
277 | /* ps->ps_running = 0; */ | | 240 | /* ps->ps_running = 0; */ |
278 | /* ps->ps_error = 0; */ | | 241 | /* ps->ps_error = 0; */ |
279 | /* ps->ps_failed = 0; */ | | 242 | /* ps->ps_failed = 0; */ |
| | | 243 | ps->ps_orig_bp = obp; |
280 | ps->ps_endoffset = -1; | | 244 | ps->ps_endoffset = -1; |
281 | mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); | | 245 | mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); |
282 | cv_init(&ps->ps_cv, "physio"); | | 246 | cv_init(&ps->ps_cv, "physio"); |
283 | | | 247 | |
284 | /* Make sure we have a buffer, creating one if necessary. */ | | 248 | /* Make sure we have a buffer, creating one if necessary. */ |
285 | if (obp != NULL) { | | 249 | if (obp != NULL) { |
286 | /* [raise the processor priority level to splbio;] */ | | 250 | /* [raise the processor priority level to splbio;] */ |
287 | mutex_enter(&bufcache_lock); | | 251 | mutex_enter(&bufcache_lock); |
| | | 252 | /* Mark it busy, so nobody else will use it. */ |
288 | while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) | | 253 | while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) |
289 | ; | | 254 | ; |
290 | /* Mark it busy, so nobody else will use it. */ | | | |
291 | obp->b_cflags |= BC_DONTFREE; | | | |
292 | mutex_exit(&bufcache_lock); | | 255 | mutex_exit(&bufcache_lock); |
293 | concurrency = 0; /* see "XXXkludge" comment below */ | | 256 | concurrency = 0; /* see "XXXkludge" comment below */ |
294 | } | | 257 | } |
295 | | | 258 | |
296 | uvm_lwp_hold(l); | | 259 | uvm_lwp_hold(l); |
297 | | | 260 | |
298 | for (i = 0; i < uio->uio_iovcnt; i++) { | | 261 | for (i = 0; i < uio->uio_iovcnt; i++) { |
299 | bool sync = true; | | 262 | bool sync = true; |
300 | | | 263 | |
301 | iovp = &uio->uio_iov[i]; | | 264 | iovp = &uio->uio_iov[i]; |
302 | while (iovp->iov_len > 0) { | | 265 | while (iovp->iov_len > 0) { |
303 | size_t todo; | | 266 | size_t todo; |
304 | vaddr_t endp; | | 267 | vaddr_t endp; |
| @@ -306,41 +269,42 @@ physio(void (*strategy)(struct buf *), s | | | @@ -306,41 +269,42 @@ physio(void (*strategy)(struct buf *), s |
306 | mutex_enter(&ps->ps_lock); | | 269 | mutex_enter(&ps->ps_lock); |
307 | if (ps->ps_failed != 0) { | | 270 | if (ps->ps_failed != 0) { |
308 | goto done_locked; | | 271 | goto done_locked; |
309 | } | | 272 | } |
310 | physio_wait(ps, sync ? 0 : concurrency); | | 273 | physio_wait(ps, sync ? 0 : concurrency); |
311 | mutex_exit(&ps->ps_lock); | | 274 | mutex_exit(&ps->ps_lock); |
312 | if (obp != NULL) { | | 275 | if (obp != NULL) { |
313 | /* | | 276 | /* |
314 | * XXXkludge | | 277 | * XXXkludge |
315 | * some drivers use "obp" as an identifier. | | 278 | * some drivers use "obp" as an identifier. |
316 | */ | | 279 | */ |
317 | bp = obp; | | 280 | bp = obp; |
318 | } else { | | 281 | } else { |
319 | bp = getphysbuf(); | | 282 | bp = getiobuf(NULL, true); |
| | | 283 | bp->b_cflags = BC_BUSY; |
320 | } | | 284 | } |
321 | bp->b_dev = dev; | | 285 | bp->b_dev = dev; |
322 | bp->b_proc = p; | | 286 | bp->b_proc = p; |
323 | bp->b_private = ps; | | 287 | bp->b_private = ps; |
324 | | | 288 | |
325 | /* | | 289 | /* |
326 | * [mark the buffer busy for physical I/O] | | 290 | * [mark the buffer busy for physical I/O] |
327 | * (i.e. set B_PHYS (because it's an I/O to user | | 291 | * (i.e. set B_PHYS (because it's an I/O to user |
328 | * memory, and B_RAW, because B_RAW is to be | | 292 | * memory, and B_RAW, because B_RAW is to be |
329 | * "Set by physio for raw transfers.", in addition | | 293 | * "Set by physio for raw transfers.", in addition |
330 | * to the "busy" and read/write flag.) | | 294 | * to the "busy" and read/write flag.) |
331 | */ | | 295 | */ |
332 | bp->b_oflags = 0; | | 296 | bp->b_oflags = 0; |
333 | bp->b_cflags = (bp->b_cflags & BC_DONTFREE) | BC_BUSY; | | 297 | bp->b_cflags = BC_BUSY; |
334 | bp->b_flags = flags | B_PHYS | B_RAW; | | 298 | bp->b_flags = flags | B_PHYS | B_RAW; |
335 | bp->b_iodone = physio_biodone; | | 299 | bp->b_iodone = physio_biodone; |
336 | | | 300 | |
337 | /* [set up the buffer for a maximum-sized transfer] */ | | 301 | /* [set up the buffer for a maximum-sized transfer] */ |
338 | bp->b_blkno = btodb(uio->uio_offset); | | 302 | bp->b_blkno = btodb(uio->uio_offset); |
339 | if (dbtob(bp->b_blkno) != uio->uio_offset) { | | 303 | if (dbtob(bp->b_blkno) != uio->uio_offset) { |
340 | error = EINVAL; | | 304 | error = EINVAL; |
341 | goto done; | | 305 | goto done; |
342 | } | | 306 | } |
343 | bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); | | 307 | bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); |
344 | bp->b_data = iovp->iov_base; | | 308 | bp->b_data = iovp->iov_base; |
345 | | | 309 | |
346 | /* | | 310 | /* |
| @@ -403,51 +367,50 @@ done_locked: | | | @@ -403,51 +367,50 @@ done_locked: |
403 | physio_wait(ps, 0); | | 367 | physio_wait(ps, 0); |
404 | mutex_exit(&ps->ps_lock); | | 368 | mutex_exit(&ps->ps_lock); |
405 | | | 369 | |
406 | if (ps->ps_failed != 0) { | | 370 | if (ps->ps_failed != 0) { |
407 | off_t delta; | | 371 | off_t delta; |
408 | | | 372 | |
409 | delta = uio->uio_offset - ps->ps_endoffset; | | 373 | delta = uio->uio_offset - ps->ps_endoffset; |
410 | KASSERT(delta > 0); | | 374 | KASSERT(delta > 0); |
411 | uio->uio_resid += delta; | | 375 | uio->uio_resid += delta; |
412 | /* uio->uio_offset = ps->ps_endoffset; */ | | 376 | /* uio->uio_offset = ps->ps_endoffset; */ |
413 | } else { | | 377 | } else { |
414 | KASSERT(ps->ps_endoffset == -1); | | 378 | KASSERT(ps->ps_endoffset == -1); |
415 | } | | 379 | } |
416 | if (bp != NULL) { | | 380 | if (bp != NULL && bp != obp) { |
417 | putphysbuf(bp); | | 381 | putiobuf(bp); |
418 | } | | 382 | } |
419 | if (error == 0) { | | 383 | if (error == 0) { |
420 | error = ps->ps_error; | | 384 | error = ps->ps_error; |
421 | } | | 385 | } |
422 | mutex_destroy(&ps->ps_lock); | | 386 | mutex_destroy(&ps->ps_lock); |
423 | cv_destroy(&ps->ps_cv); | | 387 | cv_destroy(&ps->ps_cv); |
424 | kmem_free(ps, sizeof(*ps)); | | 388 | kmem_free(ps, sizeof(*ps)); |
425 | | | 389 | |
426 | /* | | 390 | /* |
427 | * [clean up the state of the buffer] | | 391 | * [clean up the state of the buffer] |
428 | * Remember if somebody wants it, so we can wake them up below. | | 392 | * Remember if somebody wants it, so we can wake them up below. |
429 | * Also, if we had to steal it, give it back. | | 393 | * Also, if we had to steal it, give it back. |
430 | */ | | 394 | */ |
431 | if (obp != NULL) { | | 395 | if (obp != NULL) { |
432 | KASSERT((obp->b_cflags & BC_BUSY) != 0); | | 396 | KASSERT((obp->b_cflags & BC_BUSY) != 0); |
433 | KASSERT((obp->b_cflags & BC_DONTFREE) != 0); | | | |
434 | | | 397 | |
435 | /* | | 398 | /* |
436 | * [if another process is waiting for the raw I/O buffer, | | 399 | * [if another process is waiting for the raw I/O buffer, |
437 | * wake up processes waiting to do physical I/O; | | 400 | * wake up processes waiting to do physical I/O; |
438 | */ | | 401 | */ |
439 | mutex_enter(&bufcache_lock); | | 402 | mutex_enter(&bufcache_lock); |
440 | obp->b_cflags &= ~(BC_DONTFREE | BC_BUSY | BC_WANTED); | | 403 | obp->b_cflags &= ~(BC_BUSY | BC_WANTED); |
441 | obp->b_flags &= ~(B_PHYS | B_RAW); | | 404 | obp->b_flags &= ~(B_PHYS | B_RAW); |
442 | obp->b_iodone = NULL; | | 405 | obp->b_iodone = NULL; |
443 | cv_broadcast(&obp->b_busy); | | 406 | cv_broadcast(&obp->b_busy); |
444 | mutex_exit(&bufcache_lock); | | 407 | mutex_exit(&bufcache_lock); |
445 | } | | 408 | } |
446 | uvm_lwp_rele(l); | | 409 | uvm_lwp_rele(l); |
447 | | | 410 | |
448 | DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", | | 411 | DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", |
449 | __func__, uio->uio_offset, uio->uio_resid)); | | 412 | __func__, uio->uio_offset, uio->uio_resid)); |
450 | | | 413 | |
451 | return error; | | 414 | return error; |
452 | } | | 415 | } |
453 | | | 416 | |