| @@ -1,768 +1,768 @@ | | | @@ -1,768 +1,768 @@ |
1 | /* $NetBSD: rumpblk.c,v 1.32 2009/11/25 15:01:28 pooka Exp $ */ | | 1 | /* $NetBSD: rumpblk.c,v 1.33 2009/12/02 17:18:59 pooka Exp $ */ |
2 | | | 2 | |
3 | /* | | 3 | /* |
4 | * Copyright (c) 2009 Antti Kantee. All Rights Reserved. | | 4 | * Copyright (c) 2009 Antti Kantee. All Rights Reserved. |
5 | * | | 5 | * |
6 | * Development of this software was supported by the | | 6 | * Development of this software was supported by the |
7 | * Finnish Cultural Foundation. | | 7 | * Finnish Cultural Foundation. |
8 | * | | 8 | * |
9 | * Redistribution and use in source and binary forms, with or without | | 9 | * Redistribution and use in source and binary forms, with or without |
10 | * modification, are permitted provided that the following conditions | | 10 | * modification, are permitted provided that the following conditions |
11 | * are met: | | 11 | * are met: |
12 | * 1. Redistributions of source code must retain the above copyright | | 12 | * 1. Redistributions of source code must retain the above copyright |
13 | * notice, this list of conditions and the following disclaimer. | | 13 | * notice, this list of conditions and the following disclaimer. |
14 | * 2. Redistributions in binary form must reproduce the above copyright | | 14 | * 2. Redistributions in binary form must reproduce the above copyright |
15 | * notice, this list of conditions and the following disclaimer in the | | 15 | * notice, this list of conditions and the following disclaimer in the |
16 | * documentation and/or other materials provided with the distribution. | | 16 | * documentation and/or other materials provided with the distribution. |
17 | * | | 17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS | | 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS |
19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | | 19 | * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | | 20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | | 21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | | 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 24 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
28 | * SUCH DAMAGE. | | 28 | * SUCH DAMAGE. |
29 | */ | | 29 | */ |
30 | | | 30 | |
31 | /* | | 31 | /* |
32 | * Block device emulation. Presents a block device interface and | | 32 | * Block device emulation. Presents a block device interface and |
33 | * uses rumpuser system calls to satisfy I/O requests. | | 33 | * uses rumpuser system calls to satisfy I/O requests. |
34 | * | | 34 | * |
35 | * We provide fault injection. The driver can be made to fail | | 35 | * We provide fault injection. The driver can be made to fail |
36 | * I/O occasionally. | | 36 | * I/O occasionally. |
37 | * | | 37 | * |
38 | * The driver also provides an optimization for regular files by | | 38 | * The driver also provides an optimization for regular files by |
39 | * using memory-mapped I/O. This avoids kernel access for every | | 39 | * using memory-mapped I/O. This avoids kernel access for every |
40 | * I/O operation. It also gives finer-grained control of how to | | 40 | * I/O operation. It also gives finer-grained control of how to |
41 | * flush data. Additionally, in case the rump kernel dumps core, | | 41 | * flush data. Additionally, in case the rump kernel dumps core, |
42 | * we get way less carnage. | | 42 | * we get way less carnage. |
43 | * | | 43 | * |
44 | * However, it is quite costly in writing large amounts of | | 44 | * However, it is quite costly in writing large amounts of |
45 | * file data, since old contents cannot merely be overwritten, but | | 45 | * file data, since old contents cannot merely be overwritten, but |
46 | * must be paged in first before replacing (i.e. r/m/w). Ideally, | | 46 | * must be paged in first before replacing (i.e. r/m/w). Ideally, |
47 | * we should use directio. The problem is that directio can fail | | 47 | * we should use directio. The problem is that directio can fail |
48 | * silently causing improper file system semantics (i.e. unflushed | | 48 | * silently causing improper file system semantics (i.e. unflushed |
49 | * data). Therefore, default to mmap for now. Even so, directio | | 49 | * data). Therefore, default to mmap for now. Even so, directio |
50 | * _should_ be safe and can be enabled by compiling this module | | 50 | * _should_ be safe and can be enabled by compiling this module |
51 | * with -DHAS_DIRECTIO. | | 51 | * with -DHAS_DIRECTIO. |
52 | */ | | 52 | */ |
53 | | | 53 | |
54 | #include <sys/cdefs.h> | | 54 | #include <sys/cdefs.h> |
55 | __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.32 2009/11/25 15:01:28 pooka Exp $"); | | 55 | __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.33 2009/12/02 17:18:59 pooka Exp $"); |
56 | | | 56 | |
57 | #include <sys/param.h> | | 57 | #include <sys/param.h> |
58 | #include <sys/buf.h> | | 58 | #include <sys/buf.h> |
59 | #include <sys/conf.h> | | 59 | #include <sys/conf.h> |
60 | #include <sys/condvar.h> | | 60 | #include <sys/condvar.h> |
61 | #include <sys/disklabel.h> | | 61 | #include <sys/disklabel.h> |
62 | #include <sys/evcnt.h> | | 62 | #include <sys/evcnt.h> |
63 | #include <sys/fcntl.h> | | 63 | #include <sys/fcntl.h> |
64 | #include <sys/kmem.h> | | 64 | #include <sys/kmem.h> |
65 | #include <sys/malloc.h> | | 65 | #include <sys/malloc.h> |
66 | #include <sys/queue.h> | | 66 | #include <sys/queue.h> |
67 | #include <sys/stat.h> | | 67 | #include <sys/stat.h> |
68 | | | 68 | |
69 | #include <rump/rumpuser.h> | | 69 | #include <rump/rumpuser.h> |
70 | | | 70 | |
71 | #include "rump_private.h" | | 71 | #include "rump_private.h" |
72 | #include "rump_vfs_private.h" | | 72 | #include "rump_vfs_private.h" |
73 | | | 73 | |
74 | #if 0 | | 74 | #if 0 |
75 | #define DPRINTF(x) printf x | | 75 | #define DPRINTF(x) printf x |
76 | #else | | 76 | #else |
77 | #define DPRINTF(x) | | 77 | #define DPRINTF(x) |
78 | #endif | | 78 | #endif |
79 | | | 79 | |
80 | /* Default: 16 x 1MB windows */ | | 80 | /* Default: 16 x 1MB windows */ |
81 | unsigned memwinsize = (1<<20); | | 81 | unsigned memwinsize = (1<<20); |
82 | unsigned memwincnt = 16; | | 82 | unsigned memwincnt = 16; |
83 | | | 83 | |
84 | #define STARTWIN(off) ((off) & ~(memwinsize-1)) | | 84 | #define STARTWIN(off) ((off) & ~(memwinsize-1)) |
85 | #define INWIN(win,off) ((win)->win_off == STARTWIN(off)) | | 85 | #define INWIN(win,off) ((win)->win_off == STARTWIN(off)) |
86 | #define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),memwinsize)) | | 86 | #define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),memwinsize)) |
87 | #define WINVALID(win) ((win)->win_off != (off_t)-1) | | 87 | #define WINVALID(win) ((win)->win_off != (off_t)-1) |
88 | #define WINVALIDATE(win) ((win)->win_off = (off_t)-1) | | 88 | #define WINVALIDATE(win) ((win)->win_off = (off_t)-1) |
89 | struct blkwin { | | 89 | struct blkwin { |
90 | off_t win_off; | | 90 | off_t win_off; |
91 | void *win_mem; | | 91 | void *win_mem; |
92 | int win_refcnt; | | 92 | int win_refcnt; |
93 | | | 93 | |
94 | TAILQ_ENTRY(blkwin) win_lru; | | 94 | TAILQ_ENTRY(blkwin) win_lru; |
95 | }; | | 95 | }; |
96 | | | 96 | |
97 | #define RUMPBLK_SIZE 16 | | 97 | #define RUMPBLK_SIZE 16 |
98 | static struct rblkdev { | | 98 | static struct rblkdev { |
99 | char *rblk_path; | | 99 | char *rblk_path; |
100 | int rblk_fd; | | 100 | int rblk_fd; |
101 | int rblk_opencnt; | | 101 | int rblk_opencnt; |
102 | #ifdef HAS_ODIRECT | | 102 | #ifdef HAS_ODIRECT |
103 | int rblk_dfd; | | 103 | int rblk_dfd; |
104 | #endif | | 104 | #endif |
105 | uint64_t rblk_size; | | 105 | uint64_t rblk_size; |
106 | uint64_t rblk_hostoffset; | | 106 | uint64_t rblk_hostoffset; |
107 | int rblk_ftype; | | 107 | int rblk_ftype; |
108 | | | 108 | |
109 | /* for mmap */ | | 109 | /* for mmap */ |
110 | int rblk_mmflags; | | 110 | int rblk_mmflags; |
111 | kmutex_t rblk_memmtx; | | 111 | kmutex_t rblk_memmtx; |
112 | kcondvar_t rblk_memcv; | | 112 | kcondvar_t rblk_memcv; |
113 | TAILQ_HEAD(winlru, blkwin) rblk_lruq; | | 113 | TAILQ_HEAD(winlru, blkwin) rblk_lruq; |
114 | bool rblk_waiting; | | 114 | bool rblk_waiting; |
115 | | | 115 | |
116 | struct disklabel rblk_label; | | 116 | struct disklabel rblk_label; |
117 | } minors[RUMPBLK_SIZE]; | | 117 | } minors[RUMPBLK_SIZE]; |
118 | | | 118 | |
119 | static struct evcnt ev_io_total; | | 119 | static struct evcnt ev_io_total; |
120 | static struct evcnt ev_io_async; | | 120 | static struct evcnt ev_io_async; |
121 | | | 121 | |
122 | static struct evcnt ev_memblk_hits; | | 122 | static struct evcnt ev_memblk_hits; |
123 | static struct evcnt ev_memblk_busy; | | 123 | static struct evcnt ev_memblk_busy; |
124 | | | 124 | |
125 | static struct evcnt ev_bwrite_total; | | 125 | static struct evcnt ev_bwrite_total; |
126 | static struct evcnt ev_bwrite_async; | | 126 | static struct evcnt ev_bwrite_async; |
127 | static struct evcnt ev_bread_total; | | 127 | static struct evcnt ev_bread_total; |
128 | | | 128 | |
129 | dev_type_open(rumpblk_open); | | 129 | dev_type_open(rumpblk_open); |
130 | dev_type_close(rumpblk_close); | | 130 | dev_type_close(rumpblk_close); |
131 | dev_type_read(rumpblk_read); | | 131 | dev_type_read(rumpblk_read); |
132 | dev_type_write(rumpblk_write); | | 132 | dev_type_write(rumpblk_write); |
133 | dev_type_ioctl(rumpblk_ioctl); | | 133 | dev_type_ioctl(rumpblk_ioctl); |
134 | dev_type_strategy(rumpblk_strategy); | | 134 | dev_type_strategy(rumpblk_strategy); |
135 | dev_type_strategy(rumpblk_strategy_fail); | | 135 | dev_type_strategy(rumpblk_strategy_fail); |
136 | dev_type_dump(rumpblk_dump); | | 136 | dev_type_dump(rumpblk_dump); |
137 | dev_type_size(rumpblk_size); | | 137 | dev_type_size(rumpblk_size); |
138 | | | 138 | |
139 | static const struct bdevsw rumpblk_bdevsw = { | | 139 | static const struct bdevsw rumpblk_bdevsw = { |
140 | rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl, | | 140 | rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl, |
141 | nodump, nosize, D_DISK | | 141 | nodump, nosize, D_DISK |
142 | }; | | 142 | }; |
143 | | | 143 | |
144 | static const struct bdevsw rumpblk_bdevsw_fail = { | | 144 | static const struct bdevsw rumpblk_bdevsw_fail = { |
145 | rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl, | | 145 | rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl, |
146 | nodump, nosize, D_DISK | | 146 | nodump, nosize, D_DISK |
147 | }; | | 147 | }; |
148 | | | 148 | |
149 | static const struct cdevsw rumpblk_cdevsw = { | | 149 | static const struct cdevsw rumpblk_cdevsw = { |
150 | rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write, | | 150 | rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write, |
151 | rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK | | 151 | rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK |
152 | }; | | 152 | }; |
153 | | | 153 | |
154 | /* fail every n out of BLKFAIL_MAX */ | | 154 | /* fail every n out of BLKFAIL_MAX */ |
155 | #define BLKFAIL_MAX 10000 | | 155 | #define BLKFAIL_MAX 10000 |
156 | static int blkfail; | | 156 | static int blkfail; |
157 | static unsigned randstate; | | 157 | static unsigned randstate; |
158 | static kmutex_t rumpblk_lock; | | 158 | static kmutex_t rumpblk_lock; |
159 | | | 159 | |
160 | static void | | 160 | static void |
161 | makedefaultlabel(struct disklabel *lp, off_t size, int part) | | 161 | makedefaultlabel(struct disklabel *lp, off_t size, int part) |
162 | { | | 162 | { |
163 | int i; | | 163 | int i; |
164 | | | 164 | |
165 | memset(lp, 0, sizeof(*lp)); | | 165 | memset(lp, 0, sizeof(*lp)); |
166 | | | 166 | |
167 | lp->d_secperunit = size; | | 167 | lp->d_secperunit = size; |
168 | lp->d_secsize = DEV_BSIZE; | | 168 | lp->d_secsize = DEV_BSIZE; |
169 | lp->d_nsectors = size >> DEV_BSHIFT; | | 169 | lp->d_nsectors = size >> DEV_BSHIFT; |
170 | lp->d_ntracks = 1; | | 170 | lp->d_ntracks = 1; |
171 | lp->d_ncylinders = 1; | | 171 | lp->d_ncylinders = 1; |
172 | lp->d_secpercyl = lp->d_nsectors; | | 172 | lp->d_secpercyl = lp->d_nsectors; |
173 | | | 173 | |
174 | /* oh dear oh dear */ | | 174 | /* oh dear oh dear */ |
175 | strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename)); | | 175 | strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename)); |
176 | strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); | | 176 | strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); |
177 | | | 177 | |
178 | lp->d_type = DTYPE_RUMPD; | | 178 | lp->d_type = DTYPE_RUMPD; |
179 | lp->d_rpm = 11; | | 179 | lp->d_rpm = 11; |
180 | lp->d_interleave = 1; | | 180 | lp->d_interleave = 1; |
181 | lp->d_flags = 0; | | 181 | lp->d_flags = 0; |
182 | | | 182 | |
183 | /* XXX: RAW_PART handling? */ | | 183 | /* XXX: RAW_PART handling? */ |
184 | for (i = 0; i < part; i++) { | | 184 | for (i = 0; i < part; i++) { |
185 | lp->d_partitions[i].p_fstype = FS_UNUSED; | | 185 | lp->d_partitions[i].p_fstype = FS_UNUSED; |
186 | } | | 186 | } |
187 | lp->d_partitions[part].p_size = size >> DEV_BSHIFT; | | 187 | lp->d_partitions[part].p_size = size >> DEV_BSHIFT; |
188 | lp->d_npartitions = part+1; | | 188 | lp->d_npartitions = part+1; |
189 | /* XXX: file system type? */ | | 189 | /* XXX: file system type? */ |
190 | | | 190 | |
191 | lp->d_magic = DISKMAGIC; | | 191 | lp->d_magic = DISKMAGIC; |
192 | lp->d_magic2 = DISKMAGIC; | | 192 | lp->d_magic2 = DISKMAGIC; |
193 | lp->d_checksum = 0; /* XXX */ | | 193 | lp->d_checksum = 0; /* XXX */ |
194 | } | | 194 | } |
195 | | | 195 | |
196 | static struct blkwin * | | 196 | static struct blkwin * |
197 | getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error) | | 197 | getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error) |
198 | { | | 198 | { |
199 | struct blkwin *win; | | 199 | struct blkwin *win; |
200 | | | 200 | |
201 | mutex_enter(&rblk->rblk_memmtx); | | 201 | mutex_enter(&rblk->rblk_memmtx); |
202 | retry: | | 202 | retry: |
203 | /* search for window */ | | 203 | /* search for window */ |
204 | TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) { | | 204 | TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) { |
205 | if (INWIN(win, off) && WINVALID(win)) | | 205 | if (INWIN(win, off) && WINVALID(win)) |
206 | break; | | 206 | break; |
207 | } | | 207 | } |
208 | | | 208 | |
209 | /* found? return */ | | 209 | /* found? return */ |
210 | if (win) { | | 210 | if (win) { |
211 | ev_memblk_hits.ev_count++; | | 211 | ev_memblk_hits.ev_count++; |
212 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); | | 212 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); |
213 | goto good; | | 213 | goto good; |
214 | } | | 214 | } |
215 | | | 215 | |
216 | /* | | 216 | /* |
217 | * Else, create new window. If the least recently used is not | | 217 | * Else, create new window. If the least recently used is not |
218 | * currently in use, reuse that. Otherwise we need to wait. | | 218 | * currently in use, reuse that. Otherwise we need to wait. |
219 | */ | | 219 | */ |
220 | win = TAILQ_LAST(&rblk->rblk_lruq, winlru); | | 220 | win = TAILQ_LAST(&rblk->rblk_lruq, winlru); |
221 | if (win->win_refcnt == 0) { | | 221 | if (win->win_refcnt == 0) { |
222 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); | | 222 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); |
223 | mutex_exit(&rblk->rblk_memmtx); | | 223 | mutex_exit(&rblk->rblk_memmtx); |
224 | | | 224 | |
225 | if (WINVALID(win)) { | | 225 | if (WINVALID(win)) { |
226 | DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n", | | 226 | DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n", |
227 | win, win->win_mem, win->win_off)); | | 227 | win, win->win_mem, win->win_off)); |
228 | rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); | | 228 | rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); |
229 | WINVALIDATE(win); | | 229 | WINVALIDATE(win); |
230 | } | | 230 | } |
231 | | | 231 | |
232 | win->win_off = STARTWIN(off); | | 232 | win->win_off = STARTWIN(off); |
233 | win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off, | | 233 | win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off, |
234 | WINSIZE(rblk, win), rblk->rblk_mmflags, error); | | 234 | WINSIZE(rblk, win), rblk->rblk_mmflags, error); |
235 | DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n", | | 235 | DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n", |
236 | win, win->win_off, win->win_mem)); | | 236 | win, win->win_off, win->win_mem)); |
237 | | | 237 | |
238 | mutex_enter(&rblk->rblk_memmtx); | | 238 | mutex_enter(&rblk->rblk_memmtx); |
239 | if (win->win_mem == NULL) { | | 239 | if (win->win_mem == NULL) { |
240 | WINVALIDATE(win); | | 240 | WINVALIDATE(win); |
241 | TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); | | 241 | TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); |
242 | mutex_exit(&rblk->rblk_memmtx); | | 242 | mutex_exit(&rblk->rblk_memmtx); |
243 | return NULL; | | 243 | return NULL; |
244 | } | | 244 | } |
245 | } else { | | 245 | } else { |
246 | DPRINTF(("memwin wait\n")); | | 246 | DPRINTF(("memwin wait\n")); |
247 | ev_memblk_busy.ev_count++; | | 247 | ev_memblk_busy.ev_count++; |
248 | | | 248 | |
249 | rblk->rblk_waiting = true; | | 249 | rblk->rblk_waiting = true; |
250 | cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx); | | 250 | cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx); |
251 | goto retry; | | 251 | goto retry; |
252 | } | | 252 | } |
253 | | | 253 | |
254 | good: | | 254 | good: |
255 | KASSERT(win); | | 255 | KASSERT(win); |
256 | win->win_refcnt++; | | 256 | win->win_refcnt++; |
257 | TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru); | | 257 | TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru); |
258 | mutex_exit(&rblk->rblk_memmtx); | | 258 | mutex_exit(&rblk->rblk_memmtx); |
259 | *wsize = MIN(*wsize, memwinsize - (off-win->win_off)); | | 259 | *wsize = MIN(*wsize, memwinsize - (off-win->win_off)); |
260 | KASSERT(*wsize); | | 260 | KASSERT(*wsize); |
261 | | | 261 | |
262 | return win; | | 262 | return win; |
263 | } | | 263 | } |
264 | | | 264 | |
265 | static void | | 265 | static void |
266 | putwindow(struct rblkdev *rblk, struct blkwin *win) | | 266 | putwindow(struct rblkdev *rblk, struct blkwin *win) |
267 | { | | 267 | { |
268 | | | 268 | |
269 | mutex_enter(&rblk->rblk_memmtx); | | 269 | mutex_enter(&rblk->rblk_memmtx); |
270 | if (--win->win_refcnt == 0 && rblk->rblk_waiting) { | | 270 | if (--win->win_refcnt == 0 && rblk->rblk_waiting) { |
271 | rblk->rblk_waiting = false; | | 271 | rblk->rblk_waiting = false; |
272 | cv_signal(&rblk->rblk_memcv); | | 272 | cv_signal(&rblk->rblk_memcv); |
273 | } | | 273 | } |
274 | KASSERT(win->win_refcnt >= 0); | | 274 | KASSERT(win->win_refcnt >= 0); |
275 | mutex_exit(&rblk->rblk_memmtx); | | 275 | mutex_exit(&rblk->rblk_memmtx); |
276 | } | | 276 | } |
277 | | | 277 | |
278 | static void | | 278 | static void |
279 | wincleanup(struct rblkdev *rblk) | | 279 | wincleanup(struct rblkdev *rblk) |
280 | { | | 280 | { |
281 | struct blkwin *win; | | 281 | struct blkwin *win; |
282 | | | 282 | |
283 | while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) { | | 283 | while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) { |
284 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); | | 284 | TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); |
285 | if (WINVALID(win)) { | | 285 | if (WINVALID(win)) { |
286 | DPRINTF(("cleanup win %p addr %p\n", | | 286 | DPRINTF(("cleanup win %p addr %p\n", |
287 | win, win->win_mem)); | | 287 | win, win->win_mem)); |
288 | rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); | | 288 | rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); |
289 | } | | 289 | } |
290 | kmem_free(win, sizeof(*win)); | | 290 | kmem_free(win, sizeof(*win)); |
291 | } | | 291 | } |
292 | rblk->rblk_mmflags = 0; | | 292 | rblk->rblk_mmflags = 0; |
293 | } | | 293 | } |
294 | | | 294 | |
295 | int | | 295 | int |
296 | rumpblk_init(void) | | 296 | rumpblk_init(void) |
297 | { | | 297 | { |
298 | char buf[64]; | | 298 | char buf[64]; |
299 | int rumpblk = RUMPBLK; | | 299 | int rumpblk = RUMPBLK; |
300 | unsigned tmp; | | 300 | unsigned tmp; |
301 | int error, i; | | 301 | int error, i; |
302 | | | 302 | |
303 | mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE); | | 303 | mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE); |
304 | | | 304 | |
305 | if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) { | | 305 | if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) { |
306 | blkfail = strtoul(buf, NULL, 10); | | 306 | blkfail = strtoul(buf, NULL, 10); |
307 | /* fail everything */ | | 307 | /* fail everything */ |
308 | if (blkfail > BLKFAIL_MAX) | | 308 | if (blkfail > BLKFAIL_MAX) |
309 | blkfail = BLKFAIL_MAX; | | 309 | blkfail = BLKFAIL_MAX; |
310 | if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf), | | 310 | if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf), |
311 | &error) == 0) { | | 311 | &error) == 0) { |
312 | randstate = strtoul(buf, NULL, 10); | | 312 | randstate = strtoul(buf, NULL, 10); |
313 | } else { | | 313 | } else { |
314 | randstate = arc4random(); | | 314 | randstate = arc4random(); |
315 | } | | 315 | } |
316 | printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. " | | 316 | printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. " |
317 | "seed %u\n", blkfail, BLKFAIL_MAX, randstate); | | 317 | "seed %u\n", blkfail, BLKFAIL_MAX, randstate); |
318 | } else { | | 318 | } else { |
319 | blkfail = 0; | | 319 | blkfail = 0; |
320 | } | | 320 | } |
321 | | | 321 | |
322 | if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) { | | 322 | if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) { |
323 | printf("rumpblk: "); | | 323 | printf("rumpblk: "); |
324 | tmp = strtoul(buf, NULL, 10); | | 324 | tmp = strtoul(buf, NULL, 10); |
325 | if (tmp && !(tmp & (tmp-1))) | | 325 | if (tmp && !(tmp & (tmp-1))) |
326 | memwinsize = tmp; | | 326 | memwinsize = tmp; |
327 | else | | 327 | else |
328 | printf("invalid RUMP_BLKWINSIZE %d, ", tmp); | | 328 | printf("invalid RUMP_BLKWINSIZE %d, ", tmp); |
329 | printf("using %d for memwinsize\n", memwinsize); | | 329 | printf("using %d for memwinsize\n", memwinsize); |
330 | } | | 330 | } |
331 | if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){ | | 331 | if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){ |
332 | printf("rumpblk: "); | | 332 | printf("rumpblk: "); |
333 | tmp = strtoul(buf, NULL, 10); | | 333 | tmp = strtoul(buf, NULL, 10); |
334 | if (tmp) | | 334 | if (tmp) |
335 | memwincnt = tmp; | | 335 | memwincnt = tmp; |
336 | else | | 336 | else |
337 | printf("invalid RUMP_BLKWINCOUNT %d, ", tmp); | | 337 | printf("invalid RUMP_BLKWINCOUNT %d, ", tmp); |
338 | printf("using %d for memwincount\n", memwincnt); | | 338 | printf("using %d for memwincount\n", memwincnt); |
339 | } | | 339 | } |
340 | | | 340 | |
341 | memset(minors, 0, sizeof(minors)); | | 341 | memset(minors, 0, sizeof(minors)); |
342 | for (i = 0; i < RUMPBLK_SIZE; i++) { | | 342 | for (i = 0; i < RUMPBLK_SIZE; i++) { |
343 | mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE); | | 343 | mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE); |
344 | cv_init(&minors[i].rblk_memcv, "rblkmcv"); | | 344 | cv_init(&minors[i].rblk_memcv, "rblkmcv"); |
345 | } | | 345 | } |
346 | | | 346 | |
347 | evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL, | | 347 | evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL, |
348 | "rumpblk", "rumpblk I/O reqs"); | | 348 | "rumpblk", "rumpblk I/O reqs"); |
349 | evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL, | | 349 | evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL, |
350 | "rumpblk", "rumpblk async I/O"); | | 350 | "rumpblk", "rumpblk async I/O"); |
351 | | | 351 | |
352 | evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL, | | 352 | evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL, |
353 | "rumpblk", "rumpblk bytes read"); | | 353 | "rumpblk", "rumpblk bytes read"); |
354 | evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL, | | 354 | evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL, |
355 | "rumpblk", "rumpblk bytes written"); | | 355 | "rumpblk", "rumpblk bytes written"); |
356 | evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL, | | 356 | evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL, |
357 | "rumpblk", "rumpblk bytes written async"); | | 357 | "rumpblk", "rumpblk bytes written async"); |
358 | | | 358 | |
359 | evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL, | | 359 | evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL, |
360 | "rumpblk", "memblk window hits"); | | 360 | "rumpblk", "memblk window hits"); |
361 | evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL, | | 361 | evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL, |
362 | "rumpblk", "memblk all windows busy"); | | 362 | "rumpblk", "memblk all windows busy"); |
363 | | | 363 | |
364 | if (blkfail) { | | 364 | if (blkfail) { |
365 | return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk, | | 365 | return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk, |
366 | &rumpblk_cdevsw, &rumpblk); | | 366 | &rumpblk_cdevsw, &rumpblk); |
367 | } else { | | 367 | } else { |
368 | return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk, | | 368 | return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk, |
369 | &rumpblk_cdevsw, &rumpblk); | | 369 | &rumpblk_cdevsw, &rumpblk); |
370 | } | | 370 | } |
371 | } | | 371 | } |
372 | | | 372 | |
373 | /* XXX: no deregister */ | | 373 | /* XXX: no deregister */ |
374 | int | | 374 | int |
375 | rumpblk_register(const char *path, devminor_t *dmin, | | 375 | rumpblk_register(const char *path, devminor_t *dmin, |
376 | uint64_t offset, uint64_t size) | | 376 | uint64_t offset, uint64_t size) |
377 | { | | 377 | { |
378 | struct rblkdev *rblk; | | 378 | struct rblkdev *rblk; |
379 | uint64_t flen; | | 379 | uint64_t flen; |
380 | size_t len; | | 380 | size_t len; |
381 | int ftype, error, i; | | 381 | int ftype, error, i; |
382 | | | 382 | |
383 | /* devices might not report correct size unless they're open */ | | 383 | /* devices might not report correct size unless they're open */ |
384 | if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1) | | 384 | if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1) |
385 | return error; | | 385 | return error; |
386 | | | 386 | |
387 | /* verify host file is of supported type */ | | 387 | /* verify host file is of supported type */ |
388 | if (!(ftype == RUMPUSER_FT_REG | | 388 | if (!(ftype == RUMPUSER_FT_REG |
389 | || ftype == RUMPUSER_FT_BLK | | 389 | || ftype == RUMPUSER_FT_BLK |
390 | || ftype == RUMPUSER_FT_CHR)) | | 390 | || ftype == RUMPUSER_FT_CHR)) |
391 | return EINVAL; | | 391 | return EINVAL; |
392 | | | 392 | |
393 | mutex_enter(&rumpblk_lock); | | 393 | mutex_enter(&rumpblk_lock); |
394 | for (i = 0; i < RUMPBLK_SIZE; i++) { | | 394 | for (i = 0; i < RUMPBLK_SIZE; i++) { |
395 | if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { | | 395 | if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { |
396 | mutex_exit(&rumpblk_lock); | | 396 | mutex_exit(&rumpblk_lock); |
397 | *dmin = i; | | 397 | *dmin = i; |
398 | return 0; | | 398 | return 0; |
399 | } | | 399 | } |
400 | } | | 400 | } |
401 | | | 401 | |
402 | for (i = 0; i < RUMPBLK_SIZE; i++) | | 402 | for (i = 0; i < RUMPBLK_SIZE; i++) |
403 | if (minors[i].rblk_path == NULL) | | 403 | if (minors[i].rblk_path == NULL) |
404 | break; | | 404 | break; |
405 | if (i == RUMPBLK_SIZE) { | | 405 | if (i == RUMPBLK_SIZE) { |
406 | mutex_exit(&rumpblk_lock); | | 406 | mutex_exit(&rumpblk_lock); |
407 | return EBUSY; | | 407 | return EBUSY; |
408 | } | | 408 | } |
409 | | | 409 | |
410 | rblk = &minors[i]; | | 410 | rblk = &minors[i]; |
411 | len = strlen(path); | | 411 | len = strlen(path); |
412 | rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK); | | 412 | rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK); |
413 | strcpy(rblk->rblk_path, path); | | 413 | strcpy(rblk->rblk_path, path); |
414 | rblk->rblk_fd = -1; | | 414 | rblk->rblk_fd = -1; |
415 | rblk->rblk_hostoffset = offset; | | 415 | rblk->rblk_hostoffset = offset; |
416 | if (size == RUMPBLK_SIZENOTSET) { | | 416 | if (size != RUMPBLK_SIZENOTSET) { |
417 | KASSERT(size + offset <= flen); | | 417 | KASSERT(size + offset <= flen); |
418 | rblk->rblk_size = size; | | 418 | rblk->rblk_size = size; |
419 | } else { | | 419 | } else { |
420 | KASSERT(offset < flen); | | 420 | KASSERT(offset < flen); |
421 | rblk->rblk_size = flen - offset; | | 421 | rblk->rblk_size = flen - offset; |
422 | } | | 422 | } |
423 | rblk->rblk_ftype = ftype; | | 423 | rblk->rblk_ftype = ftype; |
424 | makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i); | | 424 | makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i); |
425 | mutex_exit(&rumpblk_lock); | | 425 | mutex_exit(&rumpblk_lock); |
426 | | | 426 | |
427 | *dmin = i; | | 427 | *dmin = i; |
428 | return 0; | | 428 | return 0; |
429 | } | | 429 | } |
430 | | | 430 | |
431 | int | | 431 | int |
432 | rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l) | | 432 | rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l) |
433 | { | | 433 | { |
434 | struct rblkdev *rblk = &minors[minor(dev)]; | | 434 | struct rblkdev *rblk = &minors[minor(dev)]; |
435 | int error, fd; | | 435 | int error, fd; |
436 | | | 436 | |
437 | if (rblk->rblk_path == NULL) | | 437 | if (rblk->rblk_path == NULL) |
438 | return ENXIO; | | 438 | return ENXIO; |
439 | | | 439 | |
440 | if (rblk->rblk_fd != -1) | | 440 | if (rblk->rblk_fd != -1) |
441 | return 0; /* XXX: refcount, open mode */ | | 441 | return 0; /* XXX: refcount, open mode */ |
442 | fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error); | | 442 | fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error); |
443 | if (error) | | 443 | if (error) |
444 | return error; | | 444 | return error; |
445 | | | 445 | |
446 | #ifdef HAS_ODIRECT | | 446 | #ifdef HAS_ODIRECT |
447 | rblk->rblk_dfd = rumpuser_open(rblk->rblk_path, | | 447 | rblk->rblk_dfd = rumpuser_open(rblk->rblk_path, |
448 | OFLAGS(flag) | O_DIRECT, &error); | | 448 | OFLAGS(flag) | O_DIRECT, &error); |
449 | if (error) | | 449 | if (error) |
450 | return error; | | 450 | return error; |
451 | #endif | | 451 | #endif |
452 | | | 452 | |
453 | if (rblk->rblk_ftype == RUMPUSER_FT_REG) { | | 453 | if (rblk->rblk_ftype == RUMPUSER_FT_REG) { |
454 | uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset; | | 454 | uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset; |
455 | struct blkwin *win; | | 455 | struct blkwin *win; |
456 | int i, winsize; | | 456 | int i, winsize; |
457 | | | 457 | |
458 | /* | | 458 | /* |
459 | * Use mmap to access a regular file. Allocate and | | 459 | * Use mmap to access a regular file. Allocate and |
460 | * cache initial windows here. Failure to allocate one | | 460 | * cache initial windows here. Failure to allocate one |
461 | * means fallback to read/write i/o. | | 461 | * means fallback to read/write i/o. |
462 | */ | | 462 | */ |
463 | | | 463 | |
464 | rblk->rblk_mmflags = 0; | | 464 | rblk->rblk_mmflags = 0; |
465 | if (flag & FREAD) | | 465 | if (flag & FREAD) |
466 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ; | | 466 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ; |
467 | if (flag & FWRITE) { | | 467 | if (flag & FWRITE) { |
468 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE; | | 468 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE; |
469 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED; | | 469 | rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED; |
470 | } | | 470 | } |
471 | | | 471 | |
472 | TAILQ_INIT(&rblk->rblk_lruq); | | 472 | TAILQ_INIT(&rblk->rblk_lruq); |
473 | rblk->rblk_fd = fd; | | 473 | rblk->rblk_fd = fd; |
474 | | | 474 | |
475 | for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) { | | 475 | for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) { |
476 | win = kmem_zalloc(sizeof(*win), KM_SLEEP); | | 476 | win = kmem_zalloc(sizeof(*win), KM_SLEEP); |
477 | WINVALIDATE(win); | | 477 | WINVALIDATE(win); |
478 | TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); | | 478 | TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); |
479 | | | 479 | |
480 | /* | | 480 | /* |
481 | * Allocate first windows. Here we just generally | | 481 | * Allocate first windows. Here we just generally |
482 | * make sure a) we can mmap at all b) we have the | | 482 | * make sure a) we can mmap at all b) we have the |
483 | * necessary VA available | | 483 | * necessary VA available |
484 | */ | | 484 | */ |
485 | winsize = memwinsize; | | 485 | winsize = memwinsize; |
486 | win = getwindow(rblk, off + i*memwinsize, &winsize, | | 486 | win = getwindow(rblk, off + i*memwinsize, &winsize, |
487 | &error); | | 487 | &error); |
488 | if (win) { | | 488 | if (win) { |
489 | putwindow(rblk, win); | | 489 | putwindow(rblk, win); |
490 | } else { | | 490 | } else { |
491 | wincleanup(rblk); | | 491 | wincleanup(rblk); |
492 | break; | | 492 | break; |
493 | } | | 493 | } |
494 | } | | 494 | } |
495 | } | | 495 | } |
496 | | | 496 | |
497 | KASSERT(rblk->rblk_fd != -1); | | 497 | KASSERT(rblk->rblk_fd != -1); |
498 | return 0; | | 498 | return 0; |
499 | } | | 499 | } |
500 | | | 500 | |
501 | int | | 501 | int |
502 | rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l) | | 502 | rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l) |
503 | { | | 503 | { |
504 | struct rblkdev *rblk = &minors[minor(dev)]; | | 504 | struct rblkdev *rblk = &minors[minor(dev)]; |
505 | int dummy; | | 505 | int dummy; |
506 | | | 506 | |
507 | if (rblk->rblk_mmflags) | | 507 | if (rblk->rblk_mmflags) |
508 | wincleanup(rblk); | | 508 | wincleanup(rblk); |
509 | rumpuser_fsync(rblk->rblk_fd, &dummy); | | 509 | rumpuser_fsync(rblk->rblk_fd, &dummy); |
510 | rumpuser_close(rblk->rblk_fd, &dummy); | | 510 | rumpuser_close(rblk->rblk_fd, &dummy); |
511 | rblk->rblk_fd = -1; | | 511 | rblk->rblk_fd = -1; |
512 | | | 512 | |
513 | return 0; | | 513 | return 0; |
514 | } | | 514 | } |
515 | | | 515 | |
/*
 * Handle a minimal set of disk ioctls: disklabel queries against the
 * faked-up label created at registration time, and cache sync (a
 * no-op, since writes are synced along the write path).  Everything
 * else gets ENOTTY.
 */
int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		/* copy out the whole (synthetic) disklabel */
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		/* partition is selected by the minor number's DISKPART */
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}
547 | | | 547 | |
548 | static int | | 548 | static int |
549 | do_physio(dev_t dev, struct uio *uio, int which) | | 549 | do_physio(dev_t dev, struct uio *uio, int which) |
550 | { | | 550 | { |
551 | void (*strat)(struct buf *); | | 551 | void (*strat)(struct buf *); |
552 | | | 552 | |
553 | if (blkfail) | | 553 | if (blkfail) |
554 | strat = rumpblk_strategy_fail; | | 554 | strat = rumpblk_strategy_fail; |
555 | else | | 555 | else |
556 | strat = rumpblk_strategy; | | 556 | strat = rumpblk_strategy; |
557 | | | 557 | |
558 | return physio(strat, NULL, dev, which, minphys, uio); | | 558 | return physio(strat, NULL, dev, which, minphys, uio); |
559 | } | | 559 | } |
560 | | | 560 | |
/* character device read: raw i/o via physio */
int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}
567 | | | 567 | |
/* character device write: raw i/o via physio */
int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}
574 | | | 574 | |
/*
 * Execute one buffer i/o request against the backing host object.
 * After bounds checking, the request takes one of three paths:
 *   1) mmap windows (regular files opened with mmap support),
 *   2) handoff to the rumpuser aio queue (rump_threads set), or
 *   3) direct synchronous rumpuser i/o.
 * Completion is signalled via rump_biodone() on all paths (for the
 * aio path, by the rumpuser consumer thread).
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/* convert block number to a byte offset in the host object */
	off = bp->b_blkno << DEV_BSHIFT;
	off += rblk->rblk_hostoffset;
	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap?  handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		/*
		 * Copy through memory windows.  A request may span
		 * several windows; winsize is clamped by getwindow()
		 * to what the current window can serve.
		 */
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				/* partial completion, report error */
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/O's are
	 * executed in series.  Technically executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				/*
				 * Use the O_DIRECT fd only when offset,
				 * buffer address and length are all
				 * sector-aligned, as O_DIRECT requires.
				 */
				if ((off & (DEV_BSIZE-1)) == 0
				    && ((intptr_t)bp->b_data&(DEV_BSIZE-1)) == 0
				    && (bp->b_bcount & (DEV_BSIZE-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		/* wait for a free slot in the circular aio queue */
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		/* no threads: execute synchronously in this context */
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}
729 | | | 729 | |
/* normal strategy entry point: no fault injection */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
736 | | | 736 | |
737 | /* | | 737 | /* |
738 | * Simple random number generator. This is private so that we can | | 738 | * Simple random number generator. This is private so that we can |
 * repeatably control which blocks will fail.
740 | * | | 740 | * |
741 | * <mlelstv> pooka, rand() | | 741 | * <mlelstv> pooka, rand() |
742 | * <mlelstv> [paste] | | 742 | * <mlelstv> [paste] |
743 | */ | | 743 | */ |
744 | static unsigned | | 744 | static unsigned |
745 | gimmerand(void) | | 745 | gimmerand(void) |
746 | { | | 746 | { |
747 | | | 747 | |
748 | return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); | | 748 | return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); |
749 | } | | 749 | } |
750 | | | 750 | |
751 | /* | | 751 | /* |
752 | * Block device with very simple fault injection. Fails every | | 752 | * Block device with very simple fault injection. Fails every |
753 | * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env | | 753 | * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env |
754 | * variable RUMP_BLKFAIL. | | 754 | * variable RUMP_BLKFAIL. |
755 | */ | | 755 | */ |
756 | void | | 756 | void |
757 | rumpblk_strategy_fail(struct buf *bp) | | 757 | rumpblk_strategy_fail(struct buf *bp) |
758 | { | | 758 | { |
759 | | | 759 | |
760 | if (gimmerand() % BLKFAIL_MAX >= blkfail) { | | 760 | if (gimmerand() % BLKFAIL_MAX >= blkfail) { |
761 | dostrategy(bp); | | 761 | dostrategy(bp); |
762 | } else { | | 762 | } else { |
763 | printf("block fault injection: failing I/O on block %lld\n", | | 763 | printf("block fault injection: failing I/O on block %lld\n", |
764 | (long long)bp->b_blkno); | | 764 | (long long)bp->b_blkno); |
765 | bp->b_error = EIO; | | 765 | bp->b_error = EIO; |
766 | biodone(bp); | | 766 | biodone(bp); |
767 | } | | 767 | } |
768 | } | | 768 | } |