| @@ -1,432 +1,434 @@ | | | @@ -1,432 +1,434 @@ |
1 | /* $NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $ */ | | 1 | /* $NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 1982, 1986, 1990, 1993 | | 4 | * Copyright (c) 1982, 1986, 1990, 1993 |
5 | * The Regents of the University of California. All rights reserved. | | 5 | * The Regents of the University of California. All rights reserved. |
6 | * (c) UNIX System Laboratories, Inc. | | 6 | * (c) UNIX System Laboratories, Inc. |
7 | * All or some portions of this file are derived from material licensed | | 7 | * All or some portions of this file are derived from material licensed |
8 | * to the University of California by American Telephone and Telegraph | | 8 | * to the University of California by American Telephone and Telegraph |
9 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | | 9 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
10 | * the permission of UNIX System Laboratories, Inc. | | 10 | * the permission of UNIX System Laboratories, Inc. |
11 | * | | 11 | * |
12 | * Redistribution and use in source and binary forms, with or without | | 12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions | | 13 | * modification, are permitted provided that the following conditions |
14 | * are met: | | 14 | * are met: |
15 | * 1. Redistributions of source code must retain the above copyright | | 15 | * 1. Redistributions of source code must retain the above copyright |
16 | * notice, this list of conditions and the following disclaimer. | | 16 | * notice, this list of conditions and the following disclaimer. |
17 | * 2. Redistributions in binary form must reproduce the above copyright | | 17 | * 2. Redistributions in binary form must reproduce the above copyright |
18 | * notice, this list of conditions and the following disclaimer in the | | 18 | * notice, this list of conditions and the following disclaimer in the |
19 | * documentation and/or other materials provided with the distribution. | | 19 | * documentation and/or other materials provided with the distribution. |
20 | * 3. Neither the name of the University nor the names of its contributors | | 20 | * 3. Neither the name of the University nor the names of its contributors |
21 | * may be used to endorse or promote products derived from this software | | 21 | * may be used to endorse or promote products derived from this software |
22 | * without specific prior written permission. | | 22 | * without specific prior written permission. |
23 | * | | 23 | * |
24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
34 | * SUCH DAMAGE. | | 34 | * SUCH DAMAGE. |
35 | * | | 35 | * |
36 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 | | 36 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 |
37 | */ | | 37 | */ |
38 | | | 38 | |
39 | /*- | | 39 | /*- |
40 | * Copyright (c) 1994 Christopher G. Demetriou | | 40 | * Copyright (c) 1994 Christopher G. Demetriou |
41 | * | | 41 | * |
42 | * Redistribution and use in source and binary forms, with or without | | 42 | * Redistribution and use in source and binary forms, with or without |
43 | * modification, are permitted provided that the following conditions | | 43 | * modification, are permitted provided that the following conditions |
44 | * are met: | | 44 | * are met: |
45 | * 1. Redistributions of source code must retain the above copyright | | 45 | * 1. Redistributions of source code must retain the above copyright |
46 | * notice, this list of conditions and the following disclaimer. | | 46 | * notice, this list of conditions and the following disclaimer. |
47 | * 2. Redistributions in binary form must reproduce the above copyright | | 47 | * 2. Redistributions in binary form must reproduce the above copyright |
48 | * notice, this list of conditions and the following disclaimer in the | | 48 | * notice, this list of conditions and the following disclaimer in the |
49 | * documentation and/or other materials provided with the distribution. | | 49 | * documentation and/or other materials provided with the distribution. |
50 | * 3. All advertising materials mentioning features or use of this software | | 50 | * 3. All advertising materials mentioning features or use of this software |
51 | * must display the following acknowledgement: | | 51 | * must display the following acknowledgement: |
52 | * This product includes software developed by the University of | | 52 | * This product includes software developed by the University of |
53 | * California, Berkeley and its contributors. | | 53 | * California, Berkeley and its contributors. |
54 | * 4. Neither the name of the University nor the names of its contributors | | 54 | * 4. Neither the name of the University nor the names of its contributors |
55 | * may be used to endorse or promote products derived from this software | | 55 | * may be used to endorse or promote products derived from this software |
56 | * without specific prior written permission. | | 56 | * without specific prior written permission. |
57 | * | | 57 | * |
58 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 58 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
59 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 59 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
60 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 60 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
61 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 61 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
62 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 62 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
63 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 63 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
64 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 64 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
65 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 65 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
66 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 66 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
67 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 67 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
68 | * SUCH DAMAGE. | | 68 | * SUCH DAMAGE. |
69 | * | | 69 | * |
70 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 | | 70 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 |
71 | */ | | 71 | */ |
72 | | | 72 | |
73 | #include <sys/cdefs.h> | | 73 | #include <sys/cdefs.h> |
74 | __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88 2008/09/24 08:19:19 hannken Exp $"); | | 74 | __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $"); |
75 | | | 75 | |
76 | #include <sys/param.h> | | 76 | #include <sys/param.h> |
77 | #include <sys/systm.h> | | 77 | #include <sys/systm.h> |
78 | #include <sys/buf.h> | | 78 | #include <sys/buf.h> |
79 | #include <sys/proc.h> | | 79 | #include <sys/proc.h> |
80 | #include <sys/once.h> | | 80 | #include <sys/once.h> |
81 | #include <sys/workqueue.h> | | 81 | #include <sys/workqueue.h> |
82 | #include <sys/kmem.h> | | 82 | #include <sys/kmem.h> |
83 | | | 83 | |
84 | #include <uvm/uvm_extern.h> | | 84 | #include <uvm/uvm_extern.h> |
85 | | | 85 | |
/* One-time initialization guard; physio() passes it to RUN_ONCE() with physio_init(). */
ONCE_DECL(physio_initialized);
/* Workqueue created by physio_init(); physio_biodone() enqueues finished buffers on it. */
struct workqueue *physio_workqueue;
88 | | | 88 | |
/*
 * The routines implemented in this file are described in:
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *	    UNIX Operating System (Addison-Wesley, 1989)
 * on pages 231-233.
 */
95 | | | 95 | |
/*
 * Debug tracing.  Uncomment the PHYSIO_DEBUG definition below to turn
 * DPRINTF() into printf().  Callers wrap the whole argument list in an
 * extra pair of parentheses, e.g. DPRINTF(("%s\n", __func__)), so that
 * the macro can take a variable number of printf arguments.  When
 * PHYSIO_DEBUG is not defined the macro expands to nothing and its
 * arguments are not evaluated.
 */
/* #define	PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */
102 | | | 102 | |
103 | struct physio_stat { | | 103 | struct physio_stat { |
104 | int ps_running; | | 104 | int ps_running; |
105 | int ps_error; | | 105 | int ps_error; |
106 | int ps_failed; | | 106 | int ps_failed; |
107 | off_t ps_endoffset; | | 107 | off_t ps_endoffset; |
108 | buf_t *ps_orig_bp; | | 108 | buf_t *ps_orig_bp; |
109 | kmutex_t ps_lock; | | 109 | kmutex_t ps_lock; |
110 | kcondvar_t ps_cv; | | 110 | kcondvar_t ps_cv; |
111 | }; | | 111 | }; |
112 | | | 112 | |
113 | static void | | 113 | static void |
114 | physio_done(struct work *wk, void *dummy) | | 114 | physio_done(struct work *wk, void *dummy) |
115 | { | | 115 | { |
116 | struct buf *bp = (void *)wk; | | 116 | struct buf *bp = (void *)wk; |
117 | size_t todo = bp->b_bufsize; | | 117 | size_t todo = bp->b_bufsize; |
118 | size_t done = bp->b_bcount - bp->b_resid; | | 118 | size_t done = bp->b_bcount - bp->b_resid; |
119 | struct physio_stat *ps = bp->b_private; | | 119 | struct physio_stat *ps = bp->b_private; |
| | | 120 | bool is_iobuf; |
120 | | | 121 | |
121 | KASSERT(&bp->b_work == wk); | | 122 | KASSERT(&bp->b_work == wk); |
122 | KASSERT(bp->b_bcount <= todo); | | 123 | KASSERT(bp->b_bcount <= todo); |
123 | KASSERT(bp->b_resid <= bp->b_bcount); | | 124 | KASSERT(bp->b_resid <= bp->b_bcount); |
124 | KASSERT((bp->b_flags & B_PHYS) != 0); | | 125 | KASSERT((bp->b_flags & B_PHYS) != 0); |
125 | KASSERT(dummy == NULL); | | 126 | KASSERT(dummy == NULL); |
126 | | | 127 | |
127 | vunmapbuf(bp, todo); | | 128 | vunmapbuf(bp, todo); |
128 | uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo); | | 129 | uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo); |
129 | | | 130 | |
130 | mutex_enter(&ps->ps_lock); | | 131 | mutex_enter(&ps->ps_lock); |
| | | 132 | is_iobuf = (bp != ps->ps_orig_bp); |
131 | if (__predict_false(done != todo)) { | | 133 | if (__predict_false(done != todo)) { |
132 | off_t endoffset = dbtob(bp->b_blkno) + done; | | 134 | off_t endoffset = dbtob(bp->b_blkno) + done; |
133 | | | 135 | |
134 | /* | | 136 | /* |
135 | * we got an error or hit EOM. | | 137 | * we got an error or hit EOM. |
136 | * | | 138 | * |
137 | * we only care about the first one. | | 139 | * we only care about the first one. |
138 | * ie. the one at the lowest offset. | | 140 | * ie. the one at the lowest offset. |
139 | */ | | 141 | */ |
140 | | | 142 | |
141 | KASSERT(ps->ps_endoffset != endoffset); | | 143 | KASSERT(ps->ps_endoffset != endoffset); |
142 | DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64 | | 144 | DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64 |
143 | ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n", | | 145 | ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n", |
144 | __func__, bp->b_error, dbtob(bp->b_blkno), endoffset, | | 146 | __func__, bp->b_error, dbtob(bp->b_blkno), endoffset, |
145 | bp->b_blkno, bp->b_bcount, bp->b_flags)); | | 147 | bp->b_blkno, bp->b_bcount, bp->b_flags)); |
146 | | | 148 | |
147 | if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) { | | 149 | if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) { |
148 | DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64 | | 150 | DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64 |
149 | " -> %" PRIu64 "\n", | | 151 | " -> %" PRIu64 "\n", |
150 | __func__, ps, | | 152 | __func__, ps, |
151 | ps->ps_error, bp->b_error, | | 153 | ps->ps_error, bp->b_error, |
152 | ps->ps_endoffset, endoffset)); | | 154 | ps->ps_endoffset, endoffset)); |
153 | | | 155 | |
154 | ps->ps_endoffset = endoffset; | | 156 | ps->ps_endoffset = endoffset; |
155 | ps->ps_error = bp->b_error; | | 157 | ps->ps_error = bp->b_error; |
156 | } | | 158 | } |
157 | ps->ps_failed++; | | 159 | ps->ps_failed++; |
158 | } else { | | 160 | } else { |
159 | KASSERT(bp->b_error == 0); | | 161 | KASSERT(bp->b_error == 0); |
160 | } | | 162 | } |
161 | | | 163 | |
162 | ps->ps_running--; | | 164 | ps->ps_running--; |
163 | cv_signal(&ps->ps_cv); | | 165 | cv_signal(&ps->ps_cv); |
164 | mutex_exit(&ps->ps_lock); | | 166 | mutex_exit(&ps->ps_lock); |
165 | | | 167 | |
166 | if (bp != ps->ps_orig_bp) | | 168 | if (is_iobuf) |
167 | putiobuf(bp); | | 169 | putiobuf(bp); |
168 | } | | 170 | } |
169 | | | 171 | |
170 | static void | | 172 | static void |
171 | physio_biodone(struct buf *bp) | | 173 | physio_biodone(struct buf *bp) |
172 | { | | 174 | { |
173 | #if defined(DIAGNOSTIC) | | 175 | #if defined(DIAGNOSTIC) |
174 | struct physio_stat *ps = bp->b_private; | | 176 | struct physio_stat *ps = bp->b_private; |
175 | size_t todo = bp->b_bufsize; | | 177 | size_t todo = bp->b_bufsize; |
176 | | | 178 | |
177 | KASSERT(ps->ps_running > 0); | | 179 | KASSERT(ps->ps_running > 0); |
178 | KASSERT(bp->b_bcount <= todo); | | 180 | KASSERT(bp->b_bcount <= todo); |
179 | KASSERT(bp->b_resid <= bp->b_bcount); | | 181 | KASSERT(bp->b_resid <= bp->b_bcount); |
180 | #endif /* defined(DIAGNOSTIC) */ | | 182 | #endif /* defined(DIAGNOSTIC) */ |
181 | | | 183 | |
182 | workqueue_enqueue(physio_workqueue, &bp->b_work, NULL); | | 184 | workqueue_enqueue(physio_workqueue, &bp->b_work, NULL); |
183 | } | | 185 | } |
184 | | | 186 | |
185 | static void | | 187 | static void |
186 | physio_wait(struct physio_stat *ps, int n) | | 188 | physio_wait(struct physio_stat *ps, int n) |
187 | { | | 189 | { |
188 | | | 190 | |
189 | KASSERT(mutex_owned(&ps->ps_lock)); | | 191 | KASSERT(mutex_owned(&ps->ps_lock)); |
190 | | | 192 | |
191 | while (ps->ps_running > n) | | 193 | while (ps->ps_running > n) |
192 | cv_wait(&ps->ps_cv, &ps->ps_lock); | | 194 | cv_wait(&ps->ps_cv, &ps->ps_lock); |
193 | } | | 195 | } |
194 | | | 196 | |
195 | static int | | 197 | static int |
196 | physio_init(void) | | 198 | physio_init(void) |
197 | { | | 199 | { |
198 | int error; | | 200 | int error; |
199 | | | 201 | |
200 | KASSERT(physio_workqueue == NULL); | | 202 | KASSERT(physio_workqueue == NULL); |
201 | | | 203 | |
202 | error = workqueue_create(&physio_workqueue, "physiod", | | 204 | error = workqueue_create(&physio_workqueue, "physiod", |
203 | physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE); | | 205 | physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE); |
204 | | | 206 | |
205 | return error; | | 207 | return error; |
206 | } | | 208 | } |
207 | | | 209 | |
/*
 * Upper bound on the number of transfers physio() keeps in flight when it
 * allocates its own buffers (it waits until fewer than this are running).
 */
#define	PHYSIO_CONCURRENCY	16	/* XXX tune */
209 | | | 211 | |
210 | /* | | 212 | /* |
211 | * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly | | 213 | * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly |
212 | * from the raw device to user buffers, and bypasses the buffer cache. | | 214 | * from the raw device to user buffers, and bypasses the buffer cache. |
213 | * | | 215 | * |
214 | * Comments in brackets are from Leffler, et al.'s pseudo-code implementation. | | 216 | * Comments in brackets are from Leffler, et al.'s pseudo-code implementation. |
215 | */ | | 217 | */ |
216 | int | | 218 | int |
217 | physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags, | | 219 | physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags, |
218 | void (*min_phys)(struct buf *), struct uio *uio) | | 220 | void (*min_phys)(struct buf *), struct uio *uio) |
219 | { | | 221 | { |
220 | struct iovec *iovp; | | 222 | struct iovec *iovp; |
221 | struct lwp *l = curlwp; | | 223 | struct lwp *l = curlwp; |
222 | struct proc *p = l->l_proc; | | 224 | struct proc *p = l->l_proc; |
223 | int i, error; | | 225 | int i, error; |
224 | struct buf *bp = NULL; | | 226 | struct buf *bp = NULL; |
225 | struct physio_stat *ps; | | 227 | struct physio_stat *ps; |
226 | int concurrency = PHYSIO_CONCURRENCY - 1; | | 228 | int concurrency = PHYSIO_CONCURRENCY - 1; |
227 | | | 229 | |
228 | error = RUN_ONCE(&physio_initialized, physio_init); | | 230 | error = RUN_ONCE(&physio_initialized, physio_init); |
229 | if (__predict_false(error != 0)) { | | 231 | if (__predict_false(error != 0)) { |
230 | return error; | | 232 | return error; |
231 | } | | 233 | } |
232 | | | 234 | |
233 | DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", | | 235 | DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", |
234 | __func__, uio->uio_offset, uio->uio_resid)); | | 236 | __func__, uio->uio_offset, uio->uio_resid)); |
235 | | | 237 | |
236 | flags &= B_READ | B_WRITE; | | 238 | flags &= B_READ | B_WRITE; |
237 | | | 239 | |
238 | if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) | | 240 | if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) |
239 | return ENOMEM; | | 241 | return ENOMEM; |
240 | /* ps->ps_running = 0; */ | | 242 | /* ps->ps_running = 0; */ |
241 | /* ps->ps_error = 0; */ | | 243 | /* ps->ps_error = 0; */ |
242 | /* ps->ps_failed = 0; */ | | 244 | /* ps->ps_failed = 0; */ |
243 | ps->ps_orig_bp = obp; | | 245 | ps->ps_orig_bp = obp; |
244 | ps->ps_endoffset = -1; | | 246 | ps->ps_endoffset = -1; |
245 | mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); | | 247 | mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); |
246 | cv_init(&ps->ps_cv, "physio"); | | 248 | cv_init(&ps->ps_cv, "physio"); |
247 | | | 249 | |
248 | /* Make sure we have a buffer, creating one if necessary. */ | | 250 | /* Make sure we have a buffer, creating one if necessary. */ |
249 | if (obp != NULL) { | | 251 | if (obp != NULL) { |
250 | /* [raise the processor priority level to splbio;] */ | | 252 | /* [raise the processor priority level to splbio;] */ |
251 | mutex_enter(&bufcache_lock); | | 253 | mutex_enter(&bufcache_lock); |
252 | /* Mark it busy, so nobody else will use it. */ | | 254 | /* Mark it busy, so nobody else will use it. */ |
253 | while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) | | 255 | while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) |
254 | ; | | 256 | ; |
255 | mutex_exit(&bufcache_lock); | | 257 | mutex_exit(&bufcache_lock); |
256 | concurrency = 0; /* see "XXXkludge" comment below */ | | 258 | concurrency = 0; /* see "XXXkludge" comment below */ |
257 | } | | 259 | } |
258 | | | 260 | |
259 | uvm_lwp_hold(l); | | 261 | uvm_lwp_hold(l); |
260 | | | 262 | |
261 | for (i = 0; i < uio->uio_iovcnt; i++) { | | 263 | for (i = 0; i < uio->uio_iovcnt; i++) { |
262 | bool sync = true; | | 264 | bool sync = true; |
263 | | | 265 | |
264 | iovp = &uio->uio_iov[i]; | | 266 | iovp = &uio->uio_iov[i]; |
265 | while (iovp->iov_len > 0) { | | 267 | while (iovp->iov_len > 0) { |
266 | size_t todo; | | 268 | size_t todo; |
267 | vaddr_t endp; | | 269 | vaddr_t endp; |
268 | | | 270 | |
269 | mutex_enter(&ps->ps_lock); | | 271 | mutex_enter(&ps->ps_lock); |
270 | if (ps->ps_failed != 0) { | | 272 | if (ps->ps_failed != 0) { |
271 | goto done_locked; | | 273 | goto done_locked; |
272 | } | | 274 | } |
273 | physio_wait(ps, sync ? 0 : concurrency); | | 275 | physio_wait(ps, sync ? 0 : concurrency); |
274 | mutex_exit(&ps->ps_lock); | | 276 | mutex_exit(&ps->ps_lock); |
275 | if (obp != NULL) { | | 277 | if (obp != NULL) { |
276 | /* | | 278 | /* |
277 | * XXXkludge | | 279 | * XXXkludge |
278 | * some drivers use "obp" as an identifier. | | 280 | * some drivers use "obp" as an identifier. |
279 | */ | | 281 | */ |
280 | bp = obp; | | 282 | bp = obp; |
281 | } else { | | 283 | } else { |
282 | bp = getiobuf(NULL, true); | | 284 | bp = getiobuf(NULL, true); |
283 | bp->b_cflags = BC_BUSY; | | 285 | bp->b_cflags = BC_BUSY; |
284 | } | | 286 | } |
285 | bp->b_dev = dev; | | 287 | bp->b_dev = dev; |
286 | bp->b_proc = p; | | 288 | bp->b_proc = p; |
287 | bp->b_private = ps; | | 289 | bp->b_private = ps; |
288 | | | 290 | |
289 | /* | | 291 | /* |
290 | * [mark the buffer busy for physical I/O] | | 292 | * [mark the buffer busy for physical I/O] |
291 | * (i.e. set B_PHYS (because it's an I/O to user | | 293 | * (i.e. set B_PHYS (because it's an I/O to user |
292 | * memory, and B_RAW, because B_RAW is to be | | 294 | * memory, and B_RAW, because B_RAW is to be |
293 | * "Set by physio for raw transfers.", in addition | | 295 | * "Set by physio for raw transfers.", in addition |
294 | * to the "busy" and read/write flag.) | | 296 | * to the "busy" and read/write flag.) |
295 | */ | | 297 | */ |
296 | bp->b_oflags = 0; | | 298 | bp->b_oflags = 0; |
297 | bp->b_cflags = BC_BUSY; | | 299 | bp->b_cflags = BC_BUSY; |
298 | bp->b_flags = flags | B_PHYS | B_RAW; | | 300 | bp->b_flags = flags | B_PHYS | B_RAW; |
299 | bp->b_iodone = physio_biodone; | | 301 | bp->b_iodone = physio_biodone; |
300 | | | 302 | |
301 | /* [set up the buffer for a maximum-sized transfer] */ | | 303 | /* [set up the buffer for a maximum-sized transfer] */ |
302 | bp->b_blkno = btodb(uio->uio_offset); | | 304 | bp->b_blkno = btodb(uio->uio_offset); |
303 | if (dbtob(bp->b_blkno) != uio->uio_offset) { | | 305 | if (dbtob(bp->b_blkno) != uio->uio_offset) { |
304 | error = EINVAL; | | 306 | error = EINVAL; |
305 | goto done; | | 307 | goto done; |
306 | } | | 308 | } |
307 | bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); | | 309 | bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); |
308 | bp->b_data = iovp->iov_base; | | 310 | bp->b_data = iovp->iov_base; |
309 | | | 311 | |
310 | /* | | 312 | /* |
311 | * [call minphys to bound the transfer size] | | 313 | * [call minphys to bound the transfer size] |
312 | * and remember the amount of data to transfer, | | 314 | * and remember the amount of data to transfer, |
313 | * for later comparison. | | 315 | * for later comparison. |
314 | */ | | 316 | */ |
315 | (*min_phys)(bp); | | 317 | (*min_phys)(bp); |
316 | todo = bp->b_bufsize = bp->b_bcount; | | 318 | todo = bp->b_bufsize = bp->b_bcount; |
317 | #if defined(DIAGNOSTIC) | | 319 | #if defined(DIAGNOSTIC) |
318 | if (todo > MAXPHYS) | | 320 | if (todo > MAXPHYS) |
319 | panic("todo(%zu) > MAXPHYS; minphys broken", | | 321 | panic("todo(%zu) > MAXPHYS; minphys broken", |
320 | todo); | | 322 | todo); |
321 | #endif /* defined(DIAGNOSTIC) */ | | 323 | #endif /* defined(DIAGNOSTIC) */ |
322 | | | 324 | |
323 | sync = false; | | 325 | sync = false; |
324 | endp = (vaddr_t)bp->b_data + todo; | | 326 | endp = (vaddr_t)bp->b_data + todo; |
325 | if (trunc_page(endp) != endp) { | | 327 | if (trunc_page(endp) != endp) { |
326 | /* | | 328 | /* |
327 | * following requests can overlap. | | 329 | * following requests can overlap. |
328 | * note that uvm_vslock does round_page. | | 330 | * note that uvm_vslock does round_page. |
329 | */ | | 331 | */ |
330 | sync = true; | | 332 | sync = true; |
331 | } | | 333 | } |
332 | | | 334 | |
333 | /* | | 335 | /* |
334 | * [lock the part of the user address space involved | | 336 | * [lock the part of the user address space involved |
335 | * in the transfer] | | 337 | * in the transfer] |
336 | * Beware vmapbuf(); it clobbers b_data and | | 338 | * Beware vmapbuf(); it clobbers b_data and |
337 | * saves it in b_saveaddr. However, vunmapbuf() | | 339 | * saves it in b_saveaddr. However, vunmapbuf() |
338 | * restores it. | | 340 | * restores it. |
339 | */ | | 341 | */ |
340 | error = uvm_vslock(p->p_vmspace, bp->b_data, todo, | | 342 | error = uvm_vslock(p->p_vmspace, bp->b_data, todo, |
341 | (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ); | | 343 | (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ); |
342 | if (error) { | | 344 | if (error) { |
343 | goto done; | | 345 | goto done; |
344 | } | | 346 | } |
345 | vmapbuf(bp, todo); | | 347 | vmapbuf(bp, todo); |
346 | | | 348 | |
347 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); | | 349 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
348 | | | 350 | |
349 | mutex_enter(&ps->ps_lock); | | 351 | mutex_enter(&ps->ps_lock); |
350 | ps->ps_running++; | | 352 | ps->ps_running++; |
351 | mutex_exit(&ps->ps_lock); | | 353 | mutex_exit(&ps->ps_lock); |
352 | | | 354 | |
353 | /* [call strategy to start the transfer] */ | | 355 | /* [call strategy to start the transfer] */ |
354 | (*strategy)(bp); | | 356 | (*strategy)(bp); |
355 | bp = NULL; | | 357 | bp = NULL; |
356 | | | 358 | |
357 | iovp->iov_len -= todo; | | 359 | iovp->iov_len -= todo; |
358 | iovp->iov_base = (char *)iovp->iov_base + todo; | | 360 | iovp->iov_base = (char *)iovp->iov_base + todo; |
359 | uio->uio_offset += todo; | | 361 | uio->uio_offset += todo; |
360 | uio->uio_resid -= todo; | | 362 | uio->uio_resid -= todo; |
361 | } | | 363 | } |
362 | } | | 364 | } |
363 | | | 365 | |
364 | done: | | 366 | done: |
365 | mutex_enter(&ps->ps_lock); | | 367 | mutex_enter(&ps->ps_lock); |
366 | done_locked: | | 368 | done_locked: |
367 | physio_wait(ps, 0); | | 369 | physio_wait(ps, 0); |
368 | mutex_exit(&ps->ps_lock); | | 370 | mutex_exit(&ps->ps_lock); |
369 | | | 371 | |
370 | if (ps->ps_failed != 0) { | | 372 | if (ps->ps_failed != 0) { |
371 | off_t delta; | | 373 | off_t delta; |
372 | | | 374 | |
373 | delta = uio->uio_offset - ps->ps_endoffset; | | 375 | delta = uio->uio_offset - ps->ps_endoffset; |
374 | KASSERT(delta > 0); | | 376 | KASSERT(delta > 0); |
375 | uio->uio_resid += delta; | | 377 | uio->uio_resid += delta; |
376 | /* uio->uio_offset = ps->ps_endoffset; */ | | 378 | /* uio->uio_offset = ps->ps_endoffset; */ |
377 | } else { | | 379 | } else { |
378 | KASSERT(ps->ps_endoffset == -1); | | 380 | KASSERT(ps->ps_endoffset == -1); |
379 | } | | 381 | } |
380 | if (bp != NULL && bp != obp) { | | 382 | if (bp != NULL && bp != obp) { |
381 | putiobuf(bp); | | 383 | putiobuf(bp); |
382 | } | | 384 | } |
383 | if (error == 0) { | | 385 | if (error == 0) { |
384 | error = ps->ps_error; | | 386 | error = ps->ps_error; |
385 | } | | 387 | } |
386 | mutex_destroy(&ps->ps_lock); | | 388 | mutex_destroy(&ps->ps_lock); |
387 | cv_destroy(&ps->ps_cv); | | 389 | cv_destroy(&ps->ps_cv); |
388 | kmem_free(ps, sizeof(*ps)); | | 390 | kmem_free(ps, sizeof(*ps)); |
389 | | | 391 | |
390 | /* | | 392 | /* |
391 | * [clean up the state of the buffer] | | 393 | * [clean up the state of the buffer] |
392 | * Remember if somebody wants it, so we can wake them up below. | | 394 | * Remember if somebody wants it, so we can wake them up below. |
393 | * Also, if we had to steal it, give it back. | | 395 | * Also, if we had to steal it, give it back. |
394 | */ | | 396 | */ |
395 | if (obp != NULL) { | | 397 | if (obp != NULL) { |
396 | KASSERT((obp->b_cflags & BC_BUSY) != 0); | | 398 | KASSERT((obp->b_cflags & BC_BUSY) != 0); |
397 | | | 399 | |
398 | /* | | 400 | /* |
399 | * [if another process is waiting for the raw I/O buffer, | | 401 | * [if another process is waiting for the raw I/O buffer, |
400 | * wake up processes waiting to do physical I/O; | | 402 | * wake up processes waiting to do physical I/O; |
401 | */ | | 403 | */ |
402 | mutex_enter(&bufcache_lock); | | 404 | mutex_enter(&bufcache_lock); |
403 | obp->b_cflags &= ~(BC_BUSY | BC_WANTED); | | 405 | obp->b_cflags &= ~(BC_BUSY | BC_WANTED); |
404 | obp->b_flags &= ~(B_PHYS | B_RAW); | | 406 | obp->b_flags &= ~(B_PHYS | B_RAW); |
405 | obp->b_iodone = NULL; | | 407 | obp->b_iodone = NULL; |
406 | cv_broadcast(&obp->b_busy); | | 408 | cv_broadcast(&obp->b_busy); |
407 | mutex_exit(&bufcache_lock); | | 409 | mutex_exit(&bufcache_lock); |
408 | } | | 410 | } |
409 | uvm_lwp_rele(l); | | 411 | uvm_lwp_rele(l); |
410 | | | 412 | |
411 | DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", | | 413 | DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", |
412 | __func__, uio->uio_offset, uio->uio_resid)); | | 414 | __func__, uio->uio_offset, uio->uio_resid)); |
413 | | | 415 | |
414 | return error; | | 416 | return error; |
415 | } | | 417 | } |
416 | | | 418 | |
417 | /* | | 419 | /* |
418 | * Leffler, et al., says on p. 231: | | 420 | * Leffler, et al., says on p. 231: |
419 | * "The minphys() routine is called by physio() to adjust the | | 421 | * "The minphys() routine is called by physio() to adjust the |
420 | * size of each I/O transfer before the latter is passed to | | 422 | * size of each I/O transfer before the latter is passed to |
421 | * the strategy routine..." | | 423 | * the strategy routine..." |
422 | * | | 424 | * |
423 | * so, just adjust the buffer's count accounting to MAXPHYS here, | | 425 | * so, just adjust the buffer's count accounting to MAXPHYS here, |
424 | * and return the new count; | | 426 | * and return the new count; |
425 | */ | | 427 | */ |
426 | void | | 428 | void |
427 | minphys(struct buf *bp) | | 429 | minphys(struct buf *bp) |
428 | { | | 430 | { |
429 | | | 431 | |
430 | if (bp->b_bcount > MAXPHYS) | | 432 | if (bp->b_bcount > MAXPHYS) |
431 | bp->b_bcount = MAXPHYS; | | 433 | bp->b_bcount = MAXPHYS; |
432 | } | | 434 | } |