Wed Dec 2 17:18:59 2009 UTC ()
fix inverted comparison


(pooka)
diff -r1.32 -r1.33 src/sys/rump/librump/rumpvfs/rumpblk.c

cvs diff -r1.32 -r1.33 src/sys/rump/librump/rumpvfs/rumpblk.c (switch to unified diff)

--- src/sys/rump/librump/rumpvfs/rumpblk.c 2009/11/25 15:01:28 1.32
+++ src/sys/rump/librump/rumpvfs/rumpblk.c 2009/12/02 17:18:59 1.33
@@ -1,768 +1,768 @@ @@ -1,768 +1,768 @@
1/* $NetBSD: rumpblk.c,v 1.32 2009/11/25 15:01:28 pooka Exp $ */ 1/* $NetBSD: rumpblk.c,v 1.33 2009/12/02 17:18:59 pooka Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved. 4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved.
5 * 5 *
6 * Development of this software was supported by the 6 * Development of this software was supported by the
7 * Finnish Cultural Foundation. 7 * Finnish Cultural Foundation.
8 * 8 *
9 * Redistribution and use in source and binary forms, with or without 9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions 10 * modification, are permitted provided that the following conditions
11 * are met: 11 * are met:
12 * 1. Redistributions of source code must retain the above copyright 12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer. 13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright 14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the 15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution. 16 * documentation and/or other materials provided with the distribution.
17 * 17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE. 28 * SUCH DAMAGE.
29 */ 29 */
30 30
31/* 31/*
32 * Block device emulation. Presents a block device interface and 32 * Block device emulation. Presents a block device interface and
33 * uses rumpuser system calls to satisfy I/O requests. 33 * uses rumpuser system calls to satisfy I/O requests.
34 * 34 *
35 * We provide fault injection. The driver can be made to fail 35 * We provide fault injection. The driver can be made to fail
36 * I/O occasionally. 36 * I/O occasionally.
37 * 37 *
38 * The driver also provides an optimization for regular files by 38 * The driver also provides an optimization for regular files by
39 * using memory-mapped I/O. This avoids kernel access for every 39 * using memory-mapped I/O. This avoids kernel access for every
40 * I/O operation. It also gives finer-grained control of how to 40 * I/O operation. It also gives finer-grained control of how to
41 * flush data. Additionally, in case the rump kernel dumps core, 41 * flush data. Additionally, in case the rump kernel dumps core,
42 * we get way less carnage. 42 * we get way less carnage.
43 * 43 *
44 * However, it is quite costly in writing large amounts of 44 * However, it is quite costly in writing large amounts of
45 * file data, since old contents cannot merely be overwritten, but 45 * file data, since old contents cannot merely be overwritten, but
46 * must be paged in first before replacing (i.e. r/m/w). Ideally, 46 * must be paged in first before replacing (i.e. r/m/w). Ideally,
47 * we should use directio. The problem is that directio can fail 47 * we should use directio. The problem is that directio can fail
48 * silently causing improper file system semantics (i.e. unflushed 48 * silently causing improper file system semantics (i.e. unflushed
49 * data). Therefore, default to mmap for now. Even so, directio 49 * data). Therefore, default to mmap for now. Even so, directio
50 * _should_ be safe and can be enabled by compiling this module 50 * _should_ be safe and can be enabled by compiling this module
51 * with -DHAS_DIRECTIO. 51 * with -DHAS_DIRECTIO.
52 */ 52 */
53 53
54#include <sys/cdefs.h> 54#include <sys/cdefs.h>
55__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.32 2009/11/25 15:01:28 pooka Exp $"); 55__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.33 2009/12/02 17:18:59 pooka Exp $");
56 56
57#include <sys/param.h> 57#include <sys/param.h>
58#include <sys/buf.h> 58#include <sys/buf.h>
59#include <sys/conf.h> 59#include <sys/conf.h>
60#include <sys/condvar.h> 60#include <sys/condvar.h>
61#include <sys/disklabel.h> 61#include <sys/disklabel.h>
62#include <sys/evcnt.h> 62#include <sys/evcnt.h>
63#include <sys/fcntl.h> 63#include <sys/fcntl.h>
64#include <sys/kmem.h> 64#include <sys/kmem.h>
65#include <sys/malloc.h> 65#include <sys/malloc.h>
66#include <sys/queue.h> 66#include <sys/queue.h>
67#include <sys/stat.h> 67#include <sys/stat.h>
68 68
69#include <rump/rumpuser.h> 69#include <rump/rumpuser.h>
70 70
71#include "rump_private.h" 71#include "rump_private.h"
72#include "rump_vfs_private.h" 72#include "rump_vfs_private.h"
73 73
74#if 0 74#if 0
75#define DPRINTF(x) printf x 75#define DPRINTF(x) printf x
76#else 76#else
77#define DPRINTF(x) 77#define DPRINTF(x)
78#endif 78#endif
79 79
80/* Default: 16 x 1MB windows */ 80/* Default: 16 x 1MB windows */
81unsigned memwinsize = (1<<20); 81unsigned memwinsize = (1<<20);
82unsigned memwincnt = 16; 82unsigned memwincnt = 16;
83 83
84#define STARTWIN(off) ((off) & ~(memwinsize-1)) 84#define STARTWIN(off) ((off) & ~(memwinsize-1))
85#define INWIN(win,off) ((win)->win_off == STARTWIN(off)) 85#define INWIN(win,off) ((win)->win_off == STARTWIN(off))
86#define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),memwinsize)) 86#define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),memwinsize))
87#define WINVALID(win) ((win)->win_off != (off_t)-1) 87#define WINVALID(win) ((win)->win_off != (off_t)-1)
88#define WINVALIDATE(win) ((win)->win_off = (off_t)-1) 88#define WINVALIDATE(win) ((win)->win_off = (off_t)-1)
89struct blkwin { 89struct blkwin {
90 off_t win_off; 90 off_t win_off;
91 void *win_mem; 91 void *win_mem;
92 int win_refcnt; 92 int win_refcnt;
93 93
94 TAILQ_ENTRY(blkwin) win_lru; 94 TAILQ_ENTRY(blkwin) win_lru;
95}; 95};
96 96
97#define RUMPBLK_SIZE 16 97#define RUMPBLK_SIZE 16
98static struct rblkdev { 98static struct rblkdev {
99 char *rblk_path; 99 char *rblk_path;
100 int rblk_fd; 100 int rblk_fd;
101 int rblk_opencnt; 101 int rblk_opencnt;
102#ifdef HAS_ODIRECT 102#ifdef HAS_ODIRECT
103 int rblk_dfd; 103 int rblk_dfd;
104#endif 104#endif
105 uint64_t rblk_size; 105 uint64_t rblk_size;
106 uint64_t rblk_hostoffset; 106 uint64_t rblk_hostoffset;
107 int rblk_ftype; 107 int rblk_ftype;
108 108
109 /* for mmap */ 109 /* for mmap */
110 int rblk_mmflags; 110 int rblk_mmflags;
111 kmutex_t rblk_memmtx; 111 kmutex_t rblk_memmtx;
112 kcondvar_t rblk_memcv; 112 kcondvar_t rblk_memcv;
113 TAILQ_HEAD(winlru, blkwin) rblk_lruq; 113 TAILQ_HEAD(winlru, blkwin) rblk_lruq;
114 bool rblk_waiting; 114 bool rblk_waiting;
115 115
116 struct disklabel rblk_label; 116 struct disklabel rblk_label;
117} minors[RUMPBLK_SIZE]; 117} minors[RUMPBLK_SIZE];
118 118
119static struct evcnt ev_io_total; 119static struct evcnt ev_io_total;
120static struct evcnt ev_io_async; 120static struct evcnt ev_io_async;
121 121
122static struct evcnt ev_memblk_hits; 122static struct evcnt ev_memblk_hits;
123static struct evcnt ev_memblk_busy; 123static struct evcnt ev_memblk_busy;
124 124
125static struct evcnt ev_bwrite_total; 125static struct evcnt ev_bwrite_total;
126static struct evcnt ev_bwrite_async; 126static struct evcnt ev_bwrite_async;
127static struct evcnt ev_bread_total; 127static struct evcnt ev_bread_total;
128 128
129dev_type_open(rumpblk_open); 129dev_type_open(rumpblk_open);
130dev_type_close(rumpblk_close); 130dev_type_close(rumpblk_close);
131dev_type_read(rumpblk_read); 131dev_type_read(rumpblk_read);
132dev_type_write(rumpblk_write); 132dev_type_write(rumpblk_write);
133dev_type_ioctl(rumpblk_ioctl); 133dev_type_ioctl(rumpblk_ioctl);
134dev_type_strategy(rumpblk_strategy); 134dev_type_strategy(rumpblk_strategy);
135dev_type_strategy(rumpblk_strategy_fail); 135dev_type_strategy(rumpblk_strategy_fail);
136dev_type_dump(rumpblk_dump); 136dev_type_dump(rumpblk_dump);
137dev_type_size(rumpblk_size); 137dev_type_size(rumpblk_size);
138 138
139static const struct bdevsw rumpblk_bdevsw = { 139static const struct bdevsw rumpblk_bdevsw = {
140 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl, 140 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
141 nodump, nosize, D_DISK 141 nodump, nosize, D_DISK
142}; 142};
143 143
144static const struct bdevsw rumpblk_bdevsw_fail = { 144static const struct bdevsw rumpblk_bdevsw_fail = {
145 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl, 145 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
146 nodump, nosize, D_DISK 146 nodump, nosize, D_DISK
147}; 147};
148 148
149static const struct cdevsw rumpblk_cdevsw = { 149static const struct cdevsw rumpblk_cdevsw = {
150 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write, 150 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
151 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK 151 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
152}; 152};
153 153
154/* fail every n out of BLKFAIL_MAX */ 154/* fail every n out of BLKFAIL_MAX */
155#define BLKFAIL_MAX 10000 155#define BLKFAIL_MAX 10000
156static int blkfail; 156static int blkfail;
157static unsigned randstate; 157static unsigned randstate;
158static kmutex_t rumpblk_lock; 158static kmutex_t rumpblk_lock;
159 159
160static void 160static void
161makedefaultlabel(struct disklabel *lp, off_t size, int part) 161makedefaultlabel(struct disklabel *lp, off_t size, int part)
162{ 162{
163 int i; 163 int i;
164 164
165 memset(lp, 0, sizeof(*lp)); 165 memset(lp, 0, sizeof(*lp));
166 166
167 lp->d_secperunit = size; 167 lp->d_secperunit = size;
168 lp->d_secsize = DEV_BSIZE; 168 lp->d_secsize = DEV_BSIZE;
169 lp->d_nsectors = size >> DEV_BSHIFT; 169 lp->d_nsectors = size >> DEV_BSHIFT;
170 lp->d_ntracks = 1; 170 lp->d_ntracks = 1;
171 lp->d_ncylinders = 1; 171 lp->d_ncylinders = 1;
172 lp->d_secpercyl = lp->d_nsectors; 172 lp->d_secpercyl = lp->d_nsectors;
173 173
174 /* oh dear oh dear */ 174 /* oh dear oh dear */
175 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename)); 175 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
176 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); 176 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
177 177
178 lp->d_type = DTYPE_RUMPD; 178 lp->d_type = DTYPE_RUMPD;
179 lp->d_rpm = 11; 179 lp->d_rpm = 11;
180 lp->d_interleave = 1; 180 lp->d_interleave = 1;
181 lp->d_flags = 0; 181 lp->d_flags = 0;
182 182
183 /* XXX: RAW_PART handling? */ 183 /* XXX: RAW_PART handling? */
184 for (i = 0; i < part; i++) { 184 for (i = 0; i < part; i++) {
185 lp->d_partitions[i].p_fstype = FS_UNUSED; 185 lp->d_partitions[i].p_fstype = FS_UNUSED;
186 } 186 }
187 lp->d_partitions[part].p_size = size >> DEV_BSHIFT; 187 lp->d_partitions[part].p_size = size >> DEV_BSHIFT;
188 lp->d_npartitions = part+1; 188 lp->d_npartitions = part+1;
189 /* XXX: file system type? */ 189 /* XXX: file system type? */
190 190
191 lp->d_magic = DISKMAGIC; 191 lp->d_magic = DISKMAGIC;
192 lp->d_magic2 = DISKMAGIC; 192 lp->d_magic2 = DISKMAGIC;
193 lp->d_checksum = 0; /* XXX */ 193 lp->d_checksum = 0; /* XXX */
194} 194}
195 195
196static struct blkwin * 196static struct blkwin *
197getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error) 197getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
198{ 198{
199 struct blkwin *win; 199 struct blkwin *win;
200 200
201 mutex_enter(&rblk->rblk_memmtx); 201 mutex_enter(&rblk->rblk_memmtx);
202 retry: 202 retry:
203 /* search for window */ 203 /* search for window */
204 TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) { 204 TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
205 if (INWIN(win, off) && WINVALID(win)) 205 if (INWIN(win, off) && WINVALID(win))
206 break; 206 break;
207 } 207 }
208 208
209 /* found? return */ 209 /* found? return */
210 if (win) { 210 if (win) {
211 ev_memblk_hits.ev_count++; 211 ev_memblk_hits.ev_count++;
212 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 212 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
213 goto good; 213 goto good;
214 } 214 }
215 215
216 /* 216 /*
217 * Else, create new window. If the least recently used is not 217 * Else, create new window. If the least recently used is not
218 * currently in use, reuse that. Otherwise we need to wait. 218 * currently in use, reuse that. Otherwise we need to wait.
219 */ 219 */
220 win = TAILQ_LAST(&rblk->rblk_lruq, winlru); 220 win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
221 if (win->win_refcnt == 0) { 221 if (win->win_refcnt == 0) {
222 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 222 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
223 mutex_exit(&rblk->rblk_memmtx); 223 mutex_exit(&rblk->rblk_memmtx);
224 224
225 if (WINVALID(win)) { 225 if (WINVALID(win)) {
226 DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n", 226 DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
227 win, win->win_mem, win->win_off)); 227 win, win->win_mem, win->win_off));
228 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 228 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
229 WINVALIDATE(win); 229 WINVALIDATE(win);
230 } 230 }
231 231
232 win->win_off = STARTWIN(off); 232 win->win_off = STARTWIN(off);
233 win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off, 233 win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
234 WINSIZE(rblk, win), rblk->rblk_mmflags, error); 234 WINSIZE(rblk, win), rblk->rblk_mmflags, error);
235 DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n", 235 DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
236 win, win->win_off, win->win_mem)); 236 win, win->win_off, win->win_mem));
237 237
238 mutex_enter(&rblk->rblk_memmtx); 238 mutex_enter(&rblk->rblk_memmtx);
239 if (win->win_mem == NULL) { 239 if (win->win_mem == NULL) {
240 WINVALIDATE(win); 240 WINVALIDATE(win);
241 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 241 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
242 mutex_exit(&rblk->rblk_memmtx); 242 mutex_exit(&rblk->rblk_memmtx);
243 return NULL; 243 return NULL;
244 } 244 }
245 } else { 245 } else {
246 DPRINTF(("memwin wait\n")); 246 DPRINTF(("memwin wait\n"));
247 ev_memblk_busy.ev_count++; 247 ev_memblk_busy.ev_count++;
248 248
249 rblk->rblk_waiting = true; 249 rblk->rblk_waiting = true;
250 cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx); 250 cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
251 goto retry; 251 goto retry;
252 } 252 }
253 253
254 good: 254 good:
255 KASSERT(win); 255 KASSERT(win);
256 win->win_refcnt++; 256 win->win_refcnt++;
257 TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru); 257 TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
258 mutex_exit(&rblk->rblk_memmtx); 258 mutex_exit(&rblk->rblk_memmtx);
259 *wsize = MIN(*wsize, memwinsize - (off-win->win_off)); 259 *wsize = MIN(*wsize, memwinsize - (off-win->win_off));
260 KASSERT(*wsize); 260 KASSERT(*wsize);
261 261
262 return win; 262 return win;
263} 263}
264 264
265static void 265static void
266putwindow(struct rblkdev *rblk, struct blkwin *win) 266putwindow(struct rblkdev *rblk, struct blkwin *win)
267{ 267{
268 268
269 mutex_enter(&rblk->rblk_memmtx); 269 mutex_enter(&rblk->rblk_memmtx);
270 if (--win->win_refcnt == 0 && rblk->rblk_waiting) { 270 if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
271 rblk->rblk_waiting = false; 271 rblk->rblk_waiting = false;
272 cv_signal(&rblk->rblk_memcv); 272 cv_signal(&rblk->rblk_memcv);
273 } 273 }
274 KASSERT(win->win_refcnt >= 0); 274 KASSERT(win->win_refcnt >= 0);
275 mutex_exit(&rblk->rblk_memmtx); 275 mutex_exit(&rblk->rblk_memmtx);
276} 276}
277 277
278static void 278static void
279wincleanup(struct rblkdev *rblk) 279wincleanup(struct rblkdev *rblk)
280{ 280{
281 struct blkwin *win; 281 struct blkwin *win;
282 282
283 while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) { 283 while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
284 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 284 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
285 if (WINVALID(win)) { 285 if (WINVALID(win)) {
286 DPRINTF(("cleanup win %p addr %p\n", 286 DPRINTF(("cleanup win %p addr %p\n",
287 win, win->win_mem)); 287 win, win->win_mem));
288 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 288 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
289 } 289 }
290 kmem_free(win, sizeof(*win)); 290 kmem_free(win, sizeof(*win));
291 } 291 }
292 rblk->rblk_mmflags = 0; 292 rblk->rblk_mmflags = 0;
293} 293}
294 294
295int 295int
296rumpblk_init(void) 296rumpblk_init(void)
297{ 297{
298 char buf[64]; 298 char buf[64];
299 int rumpblk = RUMPBLK; 299 int rumpblk = RUMPBLK;
300 unsigned tmp; 300 unsigned tmp;
301 int error, i; 301 int error, i;
302 302
303 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE); 303 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
304 304
305 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) { 305 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
306 blkfail = strtoul(buf, NULL, 10); 306 blkfail = strtoul(buf, NULL, 10);
307 /* fail everything */ 307 /* fail everything */
308 if (blkfail > BLKFAIL_MAX) 308 if (blkfail > BLKFAIL_MAX)
309 blkfail = BLKFAIL_MAX; 309 blkfail = BLKFAIL_MAX;
310 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf), 310 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
311 &error) == 0) { 311 &error) == 0) {
312 randstate = strtoul(buf, NULL, 10); 312 randstate = strtoul(buf, NULL, 10);
313 } else { 313 } else {
314 randstate = arc4random(); 314 randstate = arc4random();
315 } 315 }
316 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. " 316 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
317 "seed %u\n", blkfail, BLKFAIL_MAX, randstate); 317 "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
318 } else { 318 } else {
319 blkfail = 0; 319 blkfail = 0;
320 } 320 }
321 321
322 if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) { 322 if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
323 printf("rumpblk: "); 323 printf("rumpblk: ");
324 tmp = strtoul(buf, NULL, 10); 324 tmp = strtoul(buf, NULL, 10);
325 if (tmp && !(tmp & (tmp-1))) 325 if (tmp && !(tmp & (tmp-1)))
326 memwinsize = tmp; 326 memwinsize = tmp;
327 else 327 else
328 printf("invalid RUMP_BLKWINSIZE %d, ", tmp); 328 printf("invalid RUMP_BLKWINSIZE %d, ", tmp);
329 printf("using %d for memwinsize\n", memwinsize); 329 printf("using %d for memwinsize\n", memwinsize);
330 } 330 }
331 if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){ 331 if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){
332 printf("rumpblk: "); 332 printf("rumpblk: ");
333 tmp = strtoul(buf, NULL, 10); 333 tmp = strtoul(buf, NULL, 10);
334 if (tmp) 334 if (tmp)
335 memwincnt = tmp; 335 memwincnt = tmp;
336 else 336 else
337 printf("invalid RUMP_BLKWINCOUNT %d, ", tmp); 337 printf("invalid RUMP_BLKWINCOUNT %d, ", tmp);
338 printf("using %d for memwincount\n", memwincnt); 338 printf("using %d for memwincount\n", memwincnt);
339 } 339 }
340 340
341 memset(minors, 0, sizeof(minors)); 341 memset(minors, 0, sizeof(minors));
342 for (i = 0; i < RUMPBLK_SIZE; i++) { 342 for (i = 0; i < RUMPBLK_SIZE; i++) {
343 mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE); 343 mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
344 cv_init(&minors[i].rblk_memcv, "rblkmcv"); 344 cv_init(&minors[i].rblk_memcv, "rblkmcv");
345 } 345 }
346 346
347 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL, 347 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
348 "rumpblk", "rumpblk I/O reqs"); 348 "rumpblk", "rumpblk I/O reqs");
349 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL, 349 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
350 "rumpblk", "rumpblk async I/O"); 350 "rumpblk", "rumpblk async I/O");
351 351
352 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL, 352 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
353 "rumpblk", "rumpblk bytes read"); 353 "rumpblk", "rumpblk bytes read");
354 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL, 354 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
355 "rumpblk", "rumpblk bytes written"); 355 "rumpblk", "rumpblk bytes written");
356 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL, 356 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
357 "rumpblk", "rumpblk bytes written async"); 357 "rumpblk", "rumpblk bytes written async");
358 358
359 evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL, 359 evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
360 "rumpblk", "memblk window hits"); 360 "rumpblk", "memblk window hits");
361 evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL, 361 evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
362 "rumpblk", "memblk all windows busy"); 362 "rumpblk", "memblk all windows busy");
363 363
364 if (blkfail) { 364 if (blkfail) {
365 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk, 365 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
366 &rumpblk_cdevsw, &rumpblk); 366 &rumpblk_cdevsw, &rumpblk);
367 } else { 367 } else {
368 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk, 368 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
369 &rumpblk_cdevsw, &rumpblk); 369 &rumpblk_cdevsw, &rumpblk);
370 } 370 }
371} 371}
372 372
373/* XXX: no deregister */ 373/* XXX: no deregister */
374int 374int
375rumpblk_register(const char *path, devminor_t *dmin, 375rumpblk_register(const char *path, devminor_t *dmin,
376 uint64_t offset, uint64_t size) 376 uint64_t offset, uint64_t size)
377{ 377{
378 struct rblkdev *rblk; 378 struct rblkdev *rblk;
379 uint64_t flen; 379 uint64_t flen;
380 size_t len; 380 size_t len;
381 int ftype, error, i; 381 int ftype, error, i;
382 382
383 /* devices might not report correct size unless they're open */ 383 /* devices might not report correct size unless they're open */
384 if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1) 384 if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
385 return error; 385 return error;
386 386
387 /* verify host file is of supported type */ 387 /* verify host file is of supported type */
388 if (!(ftype == RUMPUSER_FT_REG 388 if (!(ftype == RUMPUSER_FT_REG
389 || ftype == RUMPUSER_FT_BLK 389 || ftype == RUMPUSER_FT_BLK
390 || ftype == RUMPUSER_FT_CHR)) 390 || ftype == RUMPUSER_FT_CHR))
391 return EINVAL; 391 return EINVAL;
392 392
393 mutex_enter(&rumpblk_lock); 393 mutex_enter(&rumpblk_lock);
394 for (i = 0; i < RUMPBLK_SIZE; i++) { 394 for (i = 0; i < RUMPBLK_SIZE; i++) {
395 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { 395 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
396 mutex_exit(&rumpblk_lock); 396 mutex_exit(&rumpblk_lock);
397 *dmin = i; 397 *dmin = i;
398 return 0; 398 return 0;
399 } 399 }
400 } 400 }
401 401
402 for (i = 0; i < RUMPBLK_SIZE; i++) 402 for (i = 0; i < RUMPBLK_SIZE; i++)
403 if (minors[i].rblk_path == NULL) 403 if (minors[i].rblk_path == NULL)
404 break; 404 break;
405 if (i == RUMPBLK_SIZE) { 405 if (i == RUMPBLK_SIZE) {
406 mutex_exit(&rumpblk_lock); 406 mutex_exit(&rumpblk_lock);
407 return EBUSY; 407 return EBUSY;
408 } 408 }
409 409
410 rblk = &minors[i]; 410 rblk = &minors[i];
411 len = strlen(path); 411 len = strlen(path);
412 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK); 412 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
413 strcpy(rblk->rblk_path, path); 413 strcpy(rblk->rblk_path, path);
414 rblk->rblk_fd = -1; 414 rblk->rblk_fd = -1;
415 rblk->rblk_hostoffset = offset; 415 rblk->rblk_hostoffset = offset;
416 if (size == RUMPBLK_SIZENOTSET) { 416 if (size != RUMPBLK_SIZENOTSET) {
417 KASSERT(size + offset <= flen); 417 KASSERT(size + offset <= flen);
418 rblk->rblk_size = size; 418 rblk->rblk_size = size;
419 } else { 419 } else {
420 KASSERT(offset < flen); 420 KASSERT(offset < flen);
421 rblk->rblk_size = flen - offset; 421 rblk->rblk_size = flen - offset;
422 } 422 }
423 rblk->rblk_ftype = ftype; 423 rblk->rblk_ftype = ftype;
424 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i); 424 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);
425 mutex_exit(&rumpblk_lock); 425 mutex_exit(&rumpblk_lock);
426 426
427 *dmin = i; 427 *dmin = i;
428 return 0; 428 return 0;
429} 429}
430 430
431int 431int
432rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l) 432rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
433{ 433{
434 struct rblkdev *rblk = &minors[minor(dev)]; 434 struct rblkdev *rblk = &minors[minor(dev)];
435 int error, fd; 435 int error, fd;
436 436
437 if (rblk->rblk_path == NULL) 437 if (rblk->rblk_path == NULL)
438 return ENXIO; 438 return ENXIO;
439 439
440 if (rblk->rblk_fd != -1) 440 if (rblk->rblk_fd != -1)
441 return 0; /* XXX: refcount, open mode */ 441 return 0; /* XXX: refcount, open mode */
442 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error); 442 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
443 if (error) 443 if (error)
444 return error; 444 return error;
445 445
446#ifdef HAS_ODIRECT 446#ifdef HAS_ODIRECT
447 rblk->rblk_dfd = rumpuser_open(rblk->rblk_path, 447 rblk->rblk_dfd = rumpuser_open(rblk->rblk_path,
448 OFLAGS(flag) | O_DIRECT, &error); 448 OFLAGS(flag) | O_DIRECT, &error);
449 if (error) 449 if (error)
450 return error; 450 return error;
451#endif 451#endif
452 452
453 if (rblk->rblk_ftype == RUMPUSER_FT_REG) { 453 if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
454 uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset; 454 uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset;
455 struct blkwin *win; 455 struct blkwin *win;
456 int i, winsize; 456 int i, winsize;
457 457
458 /* 458 /*
459 * Use mmap to access a regular file. Allocate and 459 * Use mmap to access a regular file. Allocate and
460 * cache initial windows here. Failure to allocate one 460 * cache initial windows here. Failure to allocate one
461 * means fallback to read/write i/o. 461 * means fallback to read/write i/o.
462 */ 462 */
463 463
464 rblk->rblk_mmflags = 0; 464 rblk->rblk_mmflags = 0;
465 if (flag & FREAD) 465 if (flag & FREAD)
466 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ; 466 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
467 if (flag & FWRITE) { 467 if (flag & FWRITE) {
468 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE; 468 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
469 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED; 469 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
470 } 470 }
471 471
472 TAILQ_INIT(&rblk->rblk_lruq); 472 TAILQ_INIT(&rblk->rblk_lruq);
473 rblk->rblk_fd = fd; 473 rblk->rblk_fd = fd;
474 474
475 for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) { 475 for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
476 win = kmem_zalloc(sizeof(*win), KM_SLEEP); 476 win = kmem_zalloc(sizeof(*win), KM_SLEEP);
477 WINVALIDATE(win); 477 WINVALIDATE(win);
478 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 478 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
479 479
480 /* 480 /*
481 * Allocate first windows. Here we just generally 481 * Allocate first windows. Here we just generally
482 * make sure a) we can mmap at all b) we have the 482 * make sure a) we can mmap at all b) we have the
483 * necessary VA available 483 * necessary VA available
484 */ 484 */
485 winsize = memwinsize; 485 winsize = memwinsize;
486 win = getwindow(rblk, off + i*memwinsize, &winsize, 486 win = getwindow(rblk, off + i*memwinsize, &winsize,
487 &error);  487 &error);
488 if (win) { 488 if (win) {
489 putwindow(rblk, win); 489 putwindow(rblk, win);
490 } else { 490 } else {
491 wincleanup(rblk); 491 wincleanup(rblk);
492 break; 492 break;
493 } 493 }
494 } 494 }
495 } 495 }
496 496
497 KASSERT(rblk->rblk_fd != -1); 497 KASSERT(rblk->rblk_fd != -1);
498 return 0; 498 return 0;
499} 499}
500 500
501int 501int
502rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l) 502rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
503{ 503{
504 struct rblkdev *rblk = &minors[minor(dev)]; 504 struct rblkdev *rblk = &minors[minor(dev)];
505 int dummy; 505 int dummy;
506 506
507 if (rblk->rblk_mmflags) 507 if (rblk->rblk_mmflags)
508 wincleanup(rblk); 508 wincleanup(rblk);
509 rumpuser_fsync(rblk->rblk_fd, &dummy); 509 rumpuser_fsync(rblk->rblk_fd, &dummy);
510 rumpuser_close(rblk->rblk_fd, &dummy); 510 rumpuser_close(rblk->rblk_fd, &dummy);
511 rblk->rblk_fd = -1; 511 rblk->rblk_fd = -1;
512 512
513 return 0; 513 return 0;
514} 514}
515 515
516int 516int
517rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l) 517rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
518{ 518{
519 devminor_t dmin = minor(dev); 519 devminor_t dmin = minor(dev);
520 struct rblkdev *rblk = &minors[dmin]; 520 struct rblkdev *rblk = &minors[dmin];
521 struct partinfo *pi; 521 struct partinfo *pi;
522 int error = 0; 522 int error = 0;
523 523
524 /* well, me should support a few more, but we don't for now */ 524 /* well, me should support a few more, but we don't for now */
525 switch (xfer) { 525 switch (xfer) {
526 case DIOCGDINFO: 526 case DIOCGDINFO:
527 *(struct disklabel *)addr = rblk->rblk_label; 527 *(struct disklabel *)addr = rblk->rblk_label;
528 break; 528 break;
529 529
530 case DIOCGPART: 530 case DIOCGPART:
531 pi = addr; 531 pi = addr;
532 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)]; 532 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
533 pi->disklab = &rblk->rblk_label; 533 pi->disklab = &rblk->rblk_label;
534 break; 534 break;
535 535
536 /* it's synced enough along the write path */ 536 /* it's synced enough along the write path */
537 case DIOCCACHESYNC: 537 case DIOCCACHESYNC:
538 break; 538 break;
539 539
540 default: 540 default:
541 error = ENOTTY; 541 error = ENOTTY;
542 break; 542 break;
543 } 543 }
544 544
545 return error; 545 return error;
546} 546}
547 547
548static int 548static int
549do_physio(dev_t dev, struct uio *uio, int which) 549do_physio(dev_t dev, struct uio *uio, int which)
550{ 550{
551 void (*strat)(struct buf *); 551 void (*strat)(struct buf *);
552 552
553 if (blkfail) 553 if (blkfail)
554 strat = rumpblk_strategy_fail; 554 strat = rumpblk_strategy_fail;
555 else 555 else
556 strat = rumpblk_strategy; 556 strat = rumpblk_strategy;
557 557
558 return physio(strat, NULL, dev, which, minphys, uio); 558 return physio(strat, NULL, dev, which, minphys, uio);
559} 559}
560 560
561int 561int
562rumpblk_read(dev_t dev, struct uio *uio, int flags) 562rumpblk_read(dev_t dev, struct uio *uio, int flags)
563{ 563{
564 564
565 return do_physio(dev, uio, B_READ); 565 return do_physio(dev, uio, B_READ);
566} 566}
567 567
568int 568int
569rumpblk_write(dev_t dev, struct uio *uio, int flags) 569rumpblk_write(dev_t dev, struct uio *uio, int flags)
570{ 570{
571 571
572 return do_physio(dev, uio, B_WRITE); 572 return do_physio(dev, uio, B_WRITE);
573} 573}
574 574
/*
 * Perform the actual I/O for a strategy request.  Bounds-checks the
 * request against the backing image, then services it via one of three
 * paths: direct memcpy through an mmap window, an async request queued
 * to the rumpuser aio thread, or a synchronous rumpuser read/write.
 * Completion is always signalled through rump_biodone()/biodone().
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/* translate the block number into a byte offset in the host file */
	off = bp->b_blkno << DEV_BSHIFT;
	off += rblk->rblk_hostoffset;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF: successful zero-length completion */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap? handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		/*
		 * The request may span several mmap windows; copy it
		 * window by window.  getwindow() may shrink winsize to
		 * what fits in the returned window.
		 */
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				/* report partial progress plus the error */
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/O's are
	 * executed in series.  Technically executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				/*
				 * Use the O_DIRECT descriptor only when
				 * offset, buffer and length are all
				 * DEV_BSIZE-aligned; otherwise fall back
				 * to a synchronous write.
				 */
				if ((off & (DEV_BSIZE-1)) == 0
				    && ((intptr_t)bp->b_data&(DEV_BSIZE-1)) == 0
				    && (bp->b_bcount & (DEV_BSIZE-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		/* wait for a free slot in the aio ring buffer */
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		/* no threads: do the I/O synchronously right here */
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}
729 729
/*
 * Standard (non-fault-injecting) strategy entry point.
 */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
736 736
737/* 737/*
738 * Simple random number generator. This is private so that we can 738 * Simple random number generator. This is private so that we can
739 * very repeatedly control which blocks will fail. 739 * very repeatedly control which blocks will fail.
740 * 740 *
741 * <mlelstv> pooka, rand() 741 * <mlelstv> pooka, rand()
742 * <mlelstv> [paste] 742 * <mlelstv> [paste]
743 */ 743 */
744static unsigned 744static unsigned
745gimmerand(void) 745gimmerand(void)
746{ 746{
747 747
748 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); 748 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
749} 749}
750 750
751/* 751/*
752 * Block device with very simple fault injection. Fails every 752 * Block device with very simple fault injection. Fails every
753 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env 753 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
754 * variable RUMP_BLKFAIL. 754 * variable RUMP_BLKFAIL.
755 */ 755 */
756void 756void
757rumpblk_strategy_fail(struct buf *bp) 757rumpblk_strategy_fail(struct buf *bp)
758{ 758{
759 759
760 if (gimmerand() % BLKFAIL_MAX >= blkfail) { 760 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
761 dostrategy(bp); 761 dostrategy(bp);
762 } else {  762 } else {
763 printf("block fault injection: failing I/O on block %lld\n", 763 printf("block fault injection: failing I/O on block %lld\n",
764 (long long)bp->b_blkno); 764 (long long)bp->b_blkno);
765 bp->b_error = EIO; 765 bp->b_error = EIO;
766 biodone(bp); 766 biodone(bp);
767 } 767 }
768} 768}