Mon Apr 20 16:12:28 2020 UTC
remove the I/O defragmentation logic; pass requests straight to the
underlying block device without trying to coalesce them

it seems rarely useful, and it makes the handling logic unnecessarily complex -
ultimately it is the DomU operating system's responsibility to issue optimal I/O

might also help with the ZFS problem reported on port-xen, and will surely
simplify eventual indirect segment support


(jdolecek)
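
For illustration only (not part of the commit): a minimal user-space C sketch
of the new 1:1 handling, where every ring request is turned into exactly one
I/O and submitted unchanged, with no merge pass. The names ring_req and
submit_io are hypothetical stand-ins for blkif_request_t and the driver's
strategy call on the backing device.

#include <stdio.h>
#include <stdint.h>

struct ring_req {		/* stand-in for blkif_request_t */
	uint64_t id;
	uint64_t first_sector;
	int	 nsectors;
};

/* stand-in for submitting a struct buf to the backing block device */
static void
submit_io(const struct ring_req *req)
{
	printf("I/O id %llu: sector %llu, %d sectors\n",
	    (unsigned long long)req->id,
	    (unsigned long long)req->first_sector, req->nsectors);
}

int
main(void)
{
	struct ring_req ring[3] = {
		{ 1, 0, 8 }, { 2, 8, 8 }, { 3, 64, 8 }
	};

	/*
	 * The removed code would notice that requests 1 and 2 are
	 * contiguous and try to coalesce them into one larger I/O;
	 * the new code simply submits each request as-is and leaves
	 * it to the DomU to issue optimal I/O in the first place.
	 */
	for (int i = 0; i < 3; i++)
		submit_io(&ring[i]);
	return 0;
}
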
diff -r1.82 -r1.83 src/sys/arch/xen/xen/xbdback_xenbus.c

--- src/sys/arch/xen/xen/xbdback_xenbus.c 2020/04/20 14:11:04 1.82
+++ src/sys/arch/xen/xen/xbdback_xenbus.c 2020/04/20 16:12:28 1.83
@@ -1,1875 +1,1509 @@
1/* $NetBSD: xbdback_xenbus.c,v 1.82 2020/04/20 14:11:04 jdolecek Exp $ */ 1/* $NetBSD: xbdback_xenbus.c,v 1.83 2020/04/20 16:12:28 jdolecek Exp $ */
2 2
3/* 3/*
4 * Copyright (c) 2006 Manuel Bouyer. 4 * Copyright (c) 2006 Manuel Bouyer.
5 * 5 *
6 * Redistribution and use in source and binary forms, with or without 6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions 7 * modification, are permitted provided that the following conditions
8 * are met: 8 * are met:
9 * 1. Redistributions of source code must retain the above copyright 9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer. 10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright 11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the 12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution. 13 * documentation and/or other materials provided with the distribution.
14 * 14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 * 25 *
26 */ 26 */
27 27
28#include <sys/cdefs.h> 28#include <sys/cdefs.h>
29__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.82 2020/04/20 14:11:04 jdolecek Exp $"); 29__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.83 2020/04/20 16:12:28 jdolecek Exp $");
30 30
31#include <sys/atomic.h> 31#include <sys/atomic.h>
32#include <sys/buf.h> 32#include <sys/buf.h>
33#include <sys/condvar.h> 33#include <sys/condvar.h>
34#include <sys/conf.h> 34#include <sys/conf.h>
35#include <sys/disk.h> 35#include <sys/disk.h>
36#include <sys/device.h> 36#include <sys/device.h>
37#include <sys/fcntl.h> 37#include <sys/fcntl.h>
38#include <sys/kauth.h> 38#include <sys/kauth.h>
39#include <sys/kernel.h> 39#include <sys/kernel.h>
40#include <sys/kmem.h> 40#include <sys/kmem.h>
41#include <sys/kthread.h> 41#include <sys/kthread.h>
42#include <sys/mutex.h> 42#include <sys/mutex.h>
43#include <sys/param.h> 43#include <sys/param.h>
44#include <sys/queue.h> 44#include <sys/queue.h>
45#include <sys/systm.h> 45#include <sys/systm.h>
46#include <sys/time.h> 46#include <sys/time.h>
47#include <sys/types.h> 47#include <sys/types.h>
48#include <sys/vnode.h> 48#include <sys/vnode.h>
49 49
50#include <xen/xen.h> 50#include <xen/xen.h>
51#include <xen/xen_shm.h> 51#include <xen/xen_shm.h>
52#include <xen/evtchn.h> 52#include <xen/evtchn.h>
53#include <xen/xenbus.h> 53#include <xen/xenbus.h>
54#include <xen/xenring.h> 54#include <xen/xenring.h>
55#include <xen/include/public/io/protocols.h> 55#include <xen/include/public/io/protocols.h>
56 56
57/* #define XENDEBUG_VBD */ 57/* #define XENDEBUG_VBD */
58#ifdef XENDEBUG_VBD 58#ifdef XENDEBUG_VBD
59#define XENPRINTF(x) printf x 59#define XENPRINTF(x) printf x
60#else 60#else
61#define XENPRINTF(x) 61#define XENPRINTF(x)
62#endif 62#endif
63 63
64#define BLKIF_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) 64#define BLKIF_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
65 65
66/* 66/*
67 * Backend block device driver for Xen 67 * Backend block device driver for Xen
68 */ 68 */
69 69
70/* Values are expressed in 512-byte sectors */ 70/* Values are expressed in 512-byte sectors */
71#define VBD_BSIZE 512 71#define VBD_BSIZE 512
72#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1) 72#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
73 73
74/* Need to alloc one extra page to account for possible mapping offset */ 74/* Need to alloc one extra page to account for possible mapping offset */
75#define VBD_VA_SIZE (MAXPHYS + PAGE_SIZE) 75#define VBD_VA_SIZE (MAXPHYS + PAGE_SIZE)
76 76
77struct xbdback_request; 
78struct xbdback_io; 77struct xbdback_io;
79struct xbdback_fragment; 
80struct xbdback_instance; 78struct xbdback_instance;
81 79
82/* 80/*
83 * status of a xbdback instance: 81 * status of a xbdback instance:
84 * WAITING: xbdback instance is connected, waiting for requests 82 * WAITING: xbdback instance is connected, waiting for requests
85 * RUN: xbdi thread must be woken up, I/Os have to be processed 83 * RUN: xbdi thread must be woken up, I/Os have to be processed
86 * DISCONNECTING: the instance is closing, no more I/Os can be scheduled 84 * DISCONNECTING: the instance is closing, no more I/Os can be scheduled
87 * DISCONNECTED: no I/Os, no ring, the thread should terminate. 85 * DISCONNECTED: no I/Os, no ring, the thread should terminate.
88 */ 86 */
89typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t; 87typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t;
90 88
91/* 89/*
92 * Each xbdback instance is managed by a single thread that handles all 90 * Each xbdback instance is managed by a single thread that handles all
93 * the I/O processing. As there are a variety of conditions that can block, 91 * the I/O processing. As there are a variety of conditions that can block,
94 * everything will be done in a sort of continuation-passing style. 92 * everything will be done in a sort of continuation-passing style.
95 * 93 *
96 * When the execution has to block to delay processing, for example to 94 * When the execution has to block to delay processing, for example to
97 * allow system to recover because of memory shortage (via shared memory 95 * allow system to recover because of memory shortage (via shared memory
98 * callback), the return value of a continuation can be set to NULL. In that 96 * callback), the return value of a continuation can be set to NULL. In that
99 * case, the thread will go back to sleeping and wait for the proper 97 * case, the thread will go back to sleeping and wait for the proper
100 * condition before it starts processing requests again from where it left. 98 * condition before it starts processing requests again from where it left.
101 * Continuation state is "stored" in the xbdback instance (xbdi_cont and 99 * Continuation state is "stored" in the xbdback instance (xbdi_cont),
102 * xbdi_cont_aux), and should only be manipulated by the instance thread. 100 * and should only be manipulated by the instance thread.
103 * 101 *
104 * As xbdback(4) has to handle different sort of asynchronous events (Xen 102 * As xbdback(4) has to handle different sort of asynchronous events (Xen
105 * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock 103 * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
106 * mutex is used to protect specific elements of the xbdback instance from 104 * mutex is used to protect specific elements of the xbdback instance from
107 * concurrent access: thread status and ring access (when pushing responses). 105 * concurrent access: thread status and ring access (when pushing responses).
108 *  106 *
109 * Here's how the call graph is supposed to be for a single I/O: 107 * Here's how the call graph is supposed to be for a single I/O:
110 * 108 *
111 * xbdback_co_main() 109 * xbdback_co_main()
112 * | 110 * | --> xbdback_co_cache_flush()
113 * | --> xbdback_co_cache_doflush() or NULL 111 * | | |
114 * | | 112 * | | -> xbdback_co_cache_doflush() or NULL
115 * | - xbdback_co_cache_flush2() <- xbdback_co_do_io() <- 113 * | | |
116 * | | | 114 * | | -> xbdback_co_do_io()
117 * | |-> xbdback_co_cache_flush() -> xbdback_co_map_io()- 
118 * xbdback_co_main_loop()-| 115 * xbdback_co_main_loop()-|
119 * | |-> xbdback_co_main_done() ---> xbdback_co_map_io()- 116 * | |-> xbdback_co_main_done2() or NULL
120 * | | | 
121 * | -- xbdback_co_main_done2() <-- xbdback_co_do_io() <- 
122 * | | 117 * | |
123 * | --> xbdback_co_main() or NULL 118 * | --> xbdback_co_main_incr() -> xbdback_co_main_loop()
124 * | 119 * |
125 * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop() 120 * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
126 * | 121 * |
127 * xbdback_co_io_gotreq()--+--> xbdback_co_map_io() --- 122 * xbdback_co_io_gotio() -> xbdback_map_shm()
128 * | | | 123 * | |
129 * -> xbdback_co_io_loop()----| <- xbdback_co_do_io() <-- 124 * | xbdback_co_main_incr() -> xbdback_co_main_loop()
130 * | | | | 125 * |
131 * | | | |----------> xbdback_co_io_gotio() 126 * xbdback_co_do_io()
132 * | | | | 
133 * | | xbdback_co_main_incr() | 
134 * | | | | 
135 * | | xbdback_co_main_loop() | 
136 * | | | 
137 * | xbdback_co_io_gotio2() <-----------| 
138 * | | | 
139 * | | |----------> xbdback_co_io_gotfrag() 
140 * | | | 
141 * -- xbdback_co_io_gotfrag2() <---------| 
142 * | 127 * |
143 * xbdback_co_main_incr() -> xbdback_co_main_loop() 128 * xbdback_co_main_incr() -> xbdback_co_main_loop()
144 */ 129 */
145typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *); 130typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
146 131
147enum xbdi_proto { 132enum xbdi_proto {
148 XBDIP_NATIVE, 133 XBDIP_NATIVE,
149 XBDIP_32, 134 XBDIP_32,
150 XBDIP_64 135 XBDIP_64
151}; 136};
152 137
153struct xbdback_va { 138struct xbdback_va {
154 SLIST_ENTRY(xbdback_va) xv_next; 139 SLIST_ENTRY(xbdback_va) xv_next;
155 vaddr_t xv_vaddr; 140 vaddr_t xv_vaddr;
156}; 141};
157 142
158/* we keep the xbdback instances in a linked list */ 143/* we keep the xbdback instances in a linked list */
159struct xbdback_instance { 144struct xbdback_instance {
160 SLIST_ENTRY(xbdback_instance) next; 145 SLIST_ENTRY(xbdback_instance) next;
161 struct xenbus_device *xbdi_xbusd; /* our xenstore entry */ 146 struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
162 struct xenbus_watch xbdi_watch; /* to watch our store */ 147 struct xenbus_watch xbdi_watch; /* to watch our store */
163 domid_t xbdi_domid; /* attached to this domain */ 148 domid_t xbdi_domid; /* attached to this domain */
164 uint32_t xbdi_handle; /* domain-specific handle */ 149 uint32_t xbdi_handle; /* domain-specific handle */
165 char xbdi_name[16]; /* name of this instance */ 150 char xbdi_name[16]; /* name of this instance */
166 /* mutex that protects concurrent access to the xbdback instance */ 151 /* mutex that protects concurrent access to the xbdback instance */
167 kmutex_t xbdi_lock; 152 kmutex_t xbdi_lock;
168 kcondvar_t xbdi_cv; /* wait channel for thread work */ 153 kcondvar_t xbdi_cv; /* wait channel for thread work */
169 xbdback_state_t xbdi_status; /* thread's status */ 154 xbdback_state_t xbdi_status; /* thread's status */
170 /* KVA for mapping transfers */ 155 /* KVA for mapping transfers */
171 struct xbdback_va xbdi_va[BLKIF_RING_SIZE]; 156 struct xbdback_va xbdi_va[BLKIF_RING_SIZE];
172 SLIST_HEAD(, xbdback_va) xbdi_va_free; 157 SLIST_HEAD(, xbdback_va) xbdi_va_free;
173 /* backing device parameters */ 158 /* backing device parameters */
174 dev_t xbdi_dev; 159 dev_t xbdi_dev;
175 const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */ 160 const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
176 struct vnode *xbdi_vp; 161 struct vnode *xbdi_vp;
177 uint64_t xbdi_size; 162 uint64_t xbdi_size;
178 bool xbdi_ro; /* is device read-only ? */ 163 bool xbdi_ro; /* is device read-only ? */
179 /* parameters for the communication */ 164 /* parameters for the communication */
180 unsigned int xbdi_evtchn; 165 unsigned int xbdi_evtchn;
181 struct intrhand *xbdi_ih; 166 struct intrhand *xbdi_ih;
182 /* private parameters for communication */ 167 /* private parameters for communication */
183 blkif_back_ring_proto_t xbdi_ring; 168 blkif_back_ring_proto_t xbdi_ring;
184 enum xbdi_proto xbdi_proto; 169 enum xbdi_proto xbdi_proto;
185 grant_handle_t xbdi_ring_handle; /* to unmap the ring */ 170 grant_handle_t xbdi_ring_handle; /* to unmap the ring */
186 vaddr_t xbdi_ring_va; /* to unmap the ring */ 171 vaddr_t xbdi_ring_va; /* to unmap the ring */
187 /* disconnection must be postponed until all I/O is done */ 172 /* disconnection must be postponed until all I/O is done */
188 int xbdi_refcnt; 173 int xbdi_refcnt;
189 /*  174 /*
190 * State for I/O processing/coalescing follows; this has to 175 * State for I/O processing/coalescing follows; this has to
191 * live here instead of on the stack because of the 176 * live here instead of on the stack because of the
192 * continuation-ness (see above). 177 * continuation-ness (see above).
193 */ 178 */
194 RING_IDX xbdi_req_prod; /* limit on request indices */ 179 RING_IDX xbdi_req_prod; /* limit on request indices */
195 xbdback_cont_t xbdi_cont, xbdi_cont_aux; 180 xbdback_cont_t xbdi_cont;
196 /* _request state: track requests fetched from ring */ 181 /* _request state: track requests fetched from ring */
197 struct xbdback_request *xbdi_req; /* if NULL, ignore following */ 182 struct xbdback_request *xbdi_req; /* if NULL, ignore following */
198 blkif_request_t xbdi_xen_req; 183 blkif_request_t xbdi_xen_req;
199 int xbdi_segno; 
200 /* _io state: I/O associated to this instance */ 184 /* _io state: I/O associated to this instance */
201 struct xbdback_io *xbdi_io; /* if NULL, ignore next field */ 185 struct xbdback_io *xbdi_io;
202 daddr_t xbdi_next_sector; 
203 uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */ 
204 uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */ 
205 grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */ 
206 /* other state */ 186 /* other state */
207 int xbdi_same_page; /* are we merging two segments on the same page? */ 187 int xbdi_same_page; /* are we merging two segments on the same page? */
208 uint xbdi_pendingreqs; /* number of I/O in fly */ 188 uint xbdi_pendingreqs; /* number of I/O in fly */
209 struct timeval xbdi_lasterr_time; /* error time tracking */ 189 struct timeval xbdi_lasterr_time; /* error time tracking */
210#ifdef DEBUG 190#ifdef DEBUG
211 struct timeval xbdi_lastfragio_time; /* fragmented I/O tracking */ 191 struct timeval xbdi_lastfragio_time; /* fragmented I/O tracking */
212#endif 192#endif
213}; 193};
214/* Manipulation of the above reference count. */ 194/* Manipulation of the above reference count. */
215#define xbdi_get(xbdip) atomic_inc_uint(&(xbdip)->xbdi_refcnt) 195#define xbdi_get(xbdip) atomic_inc_uint(&(xbdip)->xbdi_refcnt)
216#define xbdi_put(xbdip) \ 196#define xbdi_put(xbdip) \
217do { \ 197do { \
218 if (atomic_dec_uint_nv(&(xbdip)->xbdi_refcnt) == 0) \ 198 if (atomic_dec_uint_nv(&(xbdip)->xbdi_refcnt) == 0) \
219 xbdback_finish_disconnect(xbdip); \ 199 xbdback_finish_disconnect(xbdip); \
220} while (/* CONSTCOND */ 0) 200} while (/* CONSTCOND */ 0)
221 201
222static SLIST_HEAD(, xbdback_instance) xbdback_instances; 202static SLIST_HEAD(, xbdback_instance) xbdback_instances;
223static kmutex_t xbdback_lock; 203static kmutex_t xbdback_lock;
224 204
225/* 205/*
226 * For each request from a guest, a xbdback_request is allocated from 
227 * a pool. This will describe the request until completion. The 
228 * request may require multiple IO operations to perform, so the 
229 * per-IO information is not stored here. 
230 */ 
231struct xbdback_request { 
232 struct xbdback_instance *rq_xbdi; /* our xbd instance */ 
233 uint64_t rq_id; 
234 int rq_iocount; /* reference count; or, number of outstanding I/O's */ 
235 int rq_ioerrs; 
236 uint8_t rq_operation; 
237}; 
238 
239/* 
240 * For each I/O operation associated with one of those requests, an 206 * For each I/O operation associated with one of those requests, an
241 * xbdback_io is allocated from a pool. It may correspond to multiple 207 * xbdback_io is allocated from a pool. It may correspond to multiple
242 * Xen disk requests, or parts of them, if several arrive at once that 208 * Xen disk requests, or parts of them, if several arrive at once that
243 * can be coalesced. 209 * can be coalesced.
244 */ 210 */
245struct xbdback_io { 211struct xbdback_io {
246 /* The instance pointer is duplicated for convenience. */ 212 /* The instance pointer is duplicated for convenience. */
247 struct xbdback_instance *xio_xbdi; /* our xbd instance */ 213 struct xbdback_instance *xio_xbdi; /* our xbd instance */
248 uint8_t xio_operation; 214 uint8_t xio_operation;
 215 uint64_t xio_id;
249 union { 216 union {
250 struct { 217 struct {
251 struct buf xio_buf; /* our I/O */ 218 struct buf xio_buf; /* our I/O */
252 /* xbd requests involved */ 
253 SLIST_HEAD(, xbdback_fragment) xio_rq; 
254 /* the virtual address to map the request at */ 219 /* the virtual address to map the request at */
255 vaddr_t xio_vaddr; 220 vaddr_t xio_vaddr;
256 struct xbdback_va *xio_xv; 221 struct xbdback_va *xio_xv;
 222 vaddr_t xio_start_offset; /* I/O start offset */
257 /* grants to map */ 223 /* grants to map */
258 grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; 224 grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
259 /* grants release */ 225 /* grants release */
260 grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST]; 226 grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
261 uint16_t xio_nrma; /* number of guest pages */ 227 uint16_t xio_nrma; /* number of guest pages */
262 uint16_t xio_mapped; /* == 1: grants are mapped */ 
263 } xio_rw; 228 } xio_rw;
264 uint64_t xio_flush_id; 
265 } u; 229 } u;
266}; 230};
267#define xio_buf u.xio_rw.xio_buf 231#define xio_buf u.xio_rw.xio_buf
268#define xio_rq u.xio_rw.xio_rq 
269#define xio_vaddr u.xio_rw.xio_vaddr 232#define xio_vaddr u.xio_rw.xio_vaddr
 233#define xio_start_offset u.xio_rw.xio_start_offset
270#define xio_xv u.xio_rw.xio_xv 234#define xio_xv u.xio_rw.xio_xv
271#define xio_gref u.xio_rw.xio_gref 235#define xio_gref u.xio_rw.xio_gref
272#define xio_gh u.xio_rw.xio_gh 236#define xio_gh u.xio_rw.xio_gh
273#define xio_nrma u.xio_rw.xio_nrma 237#define xio_nrma u.xio_rw.xio_nrma
274#define xio_mapped u.xio_rw.xio_mapped 
275 
276#define xio_flush_id u.xio_flush_id 
277 
278/* 
279 * Rather than having the xbdback_io keep an array of the 
280 * xbdback_requests involved, since the actual number will probably be 
281 * small but might be as large as BLKIF_RING_SIZE, use a list. This 
282 * would be threaded through xbdback_request, but one of them might be 
283 * part of multiple I/O's, alas. 
284 */ 
285struct xbdback_fragment { 
286 struct xbdback_request *car; 
287 SLIST_ENTRY(xbdback_fragment) cdr; 
288}; 
289 238
290/* 239/*
291 * Pools to manage the chain of block requests and I/Os fragments 240 * Pools to manage the chain of block requests and I/Os fragments
292 * submitted by frontend. 241 * submitted by frontend.
293 */ 242 */
294/* XXXSMP */ 243static struct pool_cache xbdback_io_pool;
295static struct xbdback_pool { 
296 struct pool_cache pc; 
297 struct timeval last_warning; 
298} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool; 
299 244
300/* Interval between reports of I/O errors from frontend */ 245/* Interval between reports of I/O errors from frontend */
301static const struct timeval xbdback_err_intvl = { 1, 0 }; 246static const struct timeval xbdback_err_intvl = { 1, 0 };
302 247
303#ifdef DEBUG 248#ifdef DEBUG
304static const struct timeval xbdback_fragio_intvl = { 60, 0 }; 249static const struct timeval xbdback_fragio_intvl = { 60, 0 };
305#endif 250#endif
306 void xbdbackattach(int); 251 void xbdbackattach(int);
307static int xbdback_xenbus_create(struct xenbus_device *); 252static int xbdback_xenbus_create(struct xenbus_device *);
308static int xbdback_xenbus_destroy(void *); 253static int xbdback_xenbus_destroy(void *);
309static void xbdback_frontend_changed(void *, XenbusState); 254static void xbdback_frontend_changed(void *, XenbusState);
310static void xbdback_backend_changed(struct xenbus_watch *, 255static void xbdback_backend_changed(struct xenbus_watch *,
311 const char **, unsigned int); 256 const char **, unsigned int);
312static int xbdback_evthandler(void *); 257static int xbdback_evthandler(void *);
313 258
314static int xbdback_connect(struct xbdback_instance *); 259static int xbdback_connect(struct xbdback_instance *);
315static void xbdback_disconnect(struct xbdback_instance *); 260static void xbdback_disconnect(struct xbdback_instance *);
316static void xbdback_finish_disconnect(struct xbdback_instance *); 261static void xbdback_finish_disconnect(struct xbdback_instance *);
317 262
318static bool xbdif_lookup(domid_t, uint32_t); 263static bool xbdif_lookup(domid_t, uint32_t);
319 264
320static void *xbdback_co_main(struct xbdback_instance *, void *); 265static void *xbdback_co_main(struct xbdback_instance *, void *);
321static void *xbdback_co_main_loop(struct xbdback_instance *, void *); 266static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
322static void *xbdback_co_main_incr(struct xbdback_instance *, void *); 267static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
323static void *xbdback_co_main_done(struct xbdback_instance *, void *); 
324static void *xbdback_co_main_done2(struct xbdback_instance *, void *); 268static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
325 269
326static void *xbdback_co_cache_flush(struct xbdback_instance *, void *); 270static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
327static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *); 
328static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *); 271static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
329 272
330static void *xbdback_co_io(struct xbdback_instance *, void *); 273static void *xbdback_co_io(struct xbdback_instance *, void *);
331static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *); 
332static void *xbdback_co_io_loop(struct xbdback_instance *, void *); 
333static void *xbdback_co_io_gotio(struct xbdback_instance *, void *); 274static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
334static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *); 
335static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *); 
336static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *); 
337 275
338static void *xbdback_co_map_io(struct xbdback_instance *, void *); 
339static void *xbdback_co_do_io(struct xbdback_instance *, void *); 276static void *xbdback_co_do_io(struct xbdback_instance *, void *);
340 277
341static void xbdback_io_error(struct xbdback_io *, int); 278static void xbdback_io_error(struct xbdback_io *, int);
342static void xbdback_iodone(struct buf *); 279static void xbdback_iodone(struct buf *);
343static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int); 280static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int);
344 281
345static void *xbdback_map_shm(struct xbdback_io *); 282static void *xbdback_map_shm(struct xbdback_io *);
346static void xbdback_unmap_shm(struct xbdback_io *); 283static void xbdback_unmap_shm(struct xbdback_io *);
347 284
348static void *xbdback_pool_get(struct xbdback_pool *, 285static void *xbdback_pool_get(struct pool_cache *,
349 struct xbdback_instance *); 286 struct xbdback_instance *);
350static void xbdback_pool_put(struct xbdback_pool *, void *); 287static void xbdback_pool_put(struct pool_cache *, void *);
351static void xbdback_thread(void *); 288static void xbdback_thread(void *);
352static void xbdback_wakeup_thread(struct xbdback_instance *); 289static void xbdback_wakeup_thread(struct xbdback_instance *);
353static void xbdback_trampoline(struct xbdback_instance *, void *); 290static void xbdback_trampoline(struct xbdback_instance *, void *);
354 291
355static struct xenbus_backend_driver xbd_backend_driver = { 292static struct xenbus_backend_driver xbd_backend_driver = {
356 .xbakd_create = xbdback_xenbus_create, 293 .xbakd_create = xbdback_xenbus_create,
357 .xbakd_type = "vbd" 294 .xbakd_type = "vbd"
358}; 295};
359 296
360void 297void
361xbdbackattach(int n) 298xbdbackattach(int n)
362{ 299{
363 XENPRINTF(("xbdbackattach\n")); 300 XENPRINTF(("xbdbackattach\n"));
364 301
365 /* 302 /*
366 * initialize the backend driver, register the control message handler 303 * initialize the backend driver, register the control message handler
367 * and send driver up message. 304 * and send driver up message.
368 */ 305 */
369 SLIST_INIT(&xbdback_instances); 306 SLIST_INIT(&xbdback_instances);
370 mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE); 307 mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE);
371 308
372 pool_cache_bootstrap(&xbdback_request_pool.pc, 309 pool_cache_bootstrap(&xbdback_io_pool,
373 sizeof(struct xbdback_request), 0, 0, 0, "xbbrp", NULL, 
374 IPL_SOFTBIO, NULL, NULL, NULL); 
375 pool_cache_bootstrap(&xbdback_io_pool.pc, 
376 sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL, 310 sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL,
377 IPL_SOFTBIO, NULL, NULL, NULL); 311 IPL_SOFTBIO, NULL, NULL, NULL);
378 pool_cache_bootstrap(&xbdback_fragment_pool.pc, 
379 sizeof(struct xbdback_fragment), 0, 0, 0, "xbbfp", NULL, 
380 IPL_SOFTBIO, NULL, NULL, NULL); 
381 312
382 /* we allocate enough to handle a whole ring at once */ 313 /* we allocate enough to handle a whole ring at once */
383 pool_prime(&xbdback_request_pool.pc.pc_pool, BLKIF_RING_SIZE); 314 pool_prime(&xbdback_io_pool.pc_pool, BLKIF_RING_SIZE);
384 pool_prime(&xbdback_io_pool.pc.pc_pool, BLKIF_RING_SIZE); 
385 pool_prime(&xbdback_fragment_pool.pc.pc_pool, 
386 BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE); 
387 315
388 xenbus_backend_register(&xbd_backend_driver); 316 xenbus_backend_register(&xbd_backend_driver);
389} 317}
390 318
391static int 319static int
392xbdback_xenbus_create(struct xenbus_device *xbusd) 320xbdback_xenbus_create(struct xenbus_device *xbusd)
393{ 321{
394 struct xbdback_instance *xbdi; 322 struct xbdback_instance *xbdi;
395 long domid, handle; 323 long domid, handle;
396 int error, i; 324 int error, i;
397 char *ep; 325 char *ep;
398 326
399 if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path, 327 if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
400 "frontend-id", &domid, 10)) != 0) { 328 "frontend-id", &domid, 10)) != 0) {
401 aprint_error("xbdback: can't read %s/frontend-id: %d\n", 329 aprint_error("xbdback: can't read %s/frontend-id: %d\n",
402 xbusd->xbusd_path, error); 330 xbusd->xbusd_path, error);
403 return error; 331 return error;
404 } 332 }
405 333
406 /* 334 /*
407 * get handle: this is the last component of the path; which is 335 * get handle: this is the last component of the path; which is
408 * a decimal number. $path/dev contains the device name, which is not 336 * a decimal number. $path/dev contains the device name, which is not
409 * appropriate. 337 * appropriate.
410 */ 338 */
411 for (i = strlen(xbusd->xbusd_path); i > 0; i--) { 339 for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
412 if (xbusd->xbusd_path[i] == '/') 340 if (xbusd->xbusd_path[i] == '/')
413 break; 341 break;
414 } 342 }
415 if (i == 0) { 343 if (i == 0) {
416 aprint_error("xbdback: can't parse %s\n", 344 aprint_error("xbdback: can't parse %s\n",
417 xbusd->xbusd_path); 345 xbusd->xbusd_path);
418 return EFTYPE; 346 return EFTYPE;
419 } 347 }
420 handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10); 348 handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
421 if (*ep != '\0') { 349 if (*ep != '\0') {
422 aprint_error("xbdback: can't parse %s\n", 350 aprint_error("xbdback: can't parse %s\n",
423 xbusd->xbusd_path); 351 xbusd->xbusd_path);
424 return EFTYPE; 352 return EFTYPE;
425 } 353 }
426  354
427 if (xbdif_lookup(domid, handle)) { 355 if (xbdif_lookup(domid, handle)) {
428 return EEXIST; 356 return EEXIST;
429 } 357 }
430 xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP); 358 xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);
431 359
432 xbdi->xbdi_domid = domid; 360 xbdi->xbdi_domid = domid;
433 xbdi->xbdi_handle = handle; 361 xbdi->xbdi_handle = handle;
434 snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d", 362 snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
435 xbdi->xbdi_domid, xbdi->xbdi_handle); 363 xbdi->xbdi_domid, xbdi->xbdi_handle);
436 364
437 /* initialize status and reference counter */ 365 /* initialize status and reference counter */
438 xbdi->xbdi_status = DISCONNECTED; 366 xbdi->xbdi_status = DISCONNECTED;
439 xbdi_get(xbdi); 367 xbdi_get(xbdi);
440 368
441 mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO); 369 mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
442 cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name); 370 cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
443 mutex_enter(&xbdback_lock); 371 mutex_enter(&xbdback_lock);
444 SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next); 372 SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
445 mutex_exit(&xbdback_lock); 373 mutex_exit(&xbdback_lock);
446 374
447 xbusd->xbusd_u.b.b_cookie = xbdi;  375 xbusd->xbusd_u.b.b_cookie = xbdi;
448 xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy; 376 xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
449 xbusd->xbusd_otherend_changed = xbdback_frontend_changed; 377 xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
450 xbdi->xbdi_xbusd = xbusd; 378 xbdi->xbdi_xbusd = xbusd;
451 379
452 SLIST_INIT(&xbdi->xbdi_va_free); 380 SLIST_INIT(&xbdi->xbdi_va_free);
453 for (i = 0; i < BLKIF_RING_SIZE; i++) { 381 for (i = 0; i < BLKIF_RING_SIZE; i++) {
454 xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map, 382 xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
455 VBD_VA_SIZE, 0, UVM_KMF_VAONLY|UVM_KMF_WAITVA); 383 VBD_VA_SIZE, 0, UVM_KMF_VAONLY|UVM_KMF_WAITVA);
456 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, &xbdi->xbdi_va[i], 384 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, &xbdi->xbdi_va[i],
457 xv_next); 385 xv_next);
458 } 386 }
459 387
460 error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device", 388 error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
461 &xbdi->xbdi_watch, xbdback_backend_changed); 389 &xbdi->xbdi_watch, xbdback_backend_changed);
462 if (error) { 390 if (error) {
463 printf("failed to watch on %s/physical-device: %d\n", 391 printf("failed to watch on %s/physical-device: %d\n",
464 xbusd->xbusd_path, error); 392 xbusd->xbusd_path, error);
465 goto fail; 393 goto fail;
466 } 394 }
467 xbdi->xbdi_watch.xbw_dev = xbusd; 395 xbdi->xbdi_watch.xbw_dev = xbusd;
468 error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait); 396 error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
469 if (error) { 397 if (error) {
470 printf("failed to switch state on %s: %d\n", 398 printf("failed to switch state on %s: %d\n",
471 xbusd->xbusd_path, error); 399 xbusd->xbusd_path, error);
472 goto fail2; 400 goto fail2;
473 } 401 }
474 return 0; 402 return 0;
475fail2: 403fail2:
476 unregister_xenbus_watch(&xbdi->xbdi_watch); 404 unregister_xenbus_watch(&xbdi->xbdi_watch);
477fail: 405fail:
478 kmem_free(xbdi, sizeof(*xbdi)); 406 kmem_free(xbdi, sizeof(*xbdi));
479 return error; 407 return error;
480} 408}
481 409
482static int 410static int
483xbdback_xenbus_destroy(void *arg) 411xbdback_xenbus_destroy(void *arg)
484{ 412{
485 struct xbdback_instance *xbdi = arg; 413 struct xbdback_instance *xbdi = arg;
486 struct xenbus_device *xbusd = xbdi->xbdi_xbusd; 414 struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
487 struct gnttab_unmap_grant_ref ungrop; 415 struct gnttab_unmap_grant_ref ungrop;
488 int err; 416 int err;
489 417
490 XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status)); 418 XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));
491 419
492 xbdback_disconnect(xbdi); 420 xbdback_disconnect(xbdi);
493 421
494 /* unregister watch */ 422 /* unregister watch */
495 if (xbdi->xbdi_watch.node) 423 if (xbdi->xbdi_watch.node)
496 xenbus_unwatch_path(&xbdi->xbdi_watch); 424 xenbus_unwatch_path(&xbdi->xbdi_watch);
497 425
498 /* unmap ring */ 426 /* unmap ring */
499 if (xbdi->xbdi_ring_va != 0) { 427 if (xbdi->xbdi_ring_va != 0) {
500 ungrop.host_addr = xbdi->xbdi_ring_va; 428 ungrop.host_addr = xbdi->xbdi_ring_va;
501 ungrop.handle = xbdi->xbdi_ring_handle; 429 ungrop.handle = xbdi->xbdi_ring_handle;
502 ungrop.dev_bus_addr = 0; 430 ungrop.dev_bus_addr = 0;
503 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 431 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
504 &ungrop, 1); 432 &ungrop, 1);
505 if (err) 433 if (err)
506 printf("xbdback %s: unmap_grant_ref failed: %d\n", 434 printf("xbdback %s: unmap_grant_ref failed: %d\n",
507 xbusd->xbusd_otherend, err); 435 xbusd->xbusd_otherend, err);
508 uvm_km_free(kernel_map, xbdi->xbdi_ring_va, 436 uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
509 PAGE_SIZE, UVM_KMF_VAONLY); 437 PAGE_SIZE, UVM_KMF_VAONLY);
510 } 438 }
511 /* close device */ 439 /* close device */
512 if (xbdi->xbdi_size) { 440 if (xbdi->xbdi_size) {
513 const char *name; 441 const char *name;
514 struct dkwedge_info wi; 442 struct dkwedge_info wi;
515 if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0) 443 if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0)
516 name = wi.dkw_devname; 444 name = wi.dkw_devname;
517 else 445 else
518 name = "*unknown*"; 446 name = "*unknown*";
519 printf("xbd backend: detach device %s for domain %d\n", 447 printf("xbd backend: detach device %s for domain %d\n",
520 name, xbdi->xbdi_domid); 448 name, xbdi->xbdi_domid);
521 vn_close(xbdi->xbdi_vp, FREAD, NOCRED); 449 vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
522 } 450 }
523 mutex_enter(&xbdback_lock); 451 mutex_enter(&xbdback_lock);
524 SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next); 452 SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
525 mutex_exit(&xbdback_lock); 453 mutex_exit(&xbdback_lock);
526 454
527 for (int i = 0; i < BLKIF_RING_SIZE; i++) { 455 for (int i = 0; i < BLKIF_RING_SIZE; i++) {
528 if (xbdi->xbdi_va[i].xv_vaddr != 0) { 456 if (xbdi->xbdi_va[i].xv_vaddr != 0) {
529 uvm_km_free(kernel_map, xbdi->xbdi_va[i].xv_vaddr, 457 uvm_km_free(kernel_map, xbdi->xbdi_va[i].xv_vaddr,
530 VBD_VA_SIZE, UVM_KMF_VAONLY); 458 VBD_VA_SIZE, UVM_KMF_VAONLY);
531 xbdi->xbdi_va[i].xv_vaddr = 0; 459 xbdi->xbdi_va[i].xv_vaddr = 0;
532 } 460 }
533 } 461 }
534 462
535 mutex_destroy(&xbdi->xbdi_lock); 463 mutex_destroy(&xbdi->xbdi_lock);
536 cv_destroy(&xbdi->xbdi_cv); 464 cv_destroy(&xbdi->xbdi_cv);
537 kmem_free(xbdi, sizeof(*xbdi)); 465 kmem_free(xbdi, sizeof(*xbdi));
538 return 0; 466 return 0;
539} 467}
540 468
541static int 469static int
542xbdback_connect(struct xbdback_instance *xbdi) 470xbdback_connect(struct xbdback_instance *xbdi)
543{ 471{
544 int err; 472 int err;
545 struct gnttab_map_grant_ref grop; 473 struct gnttab_map_grant_ref grop;
546 struct gnttab_unmap_grant_ref ungrop; 474 struct gnttab_unmap_grant_ref ungrop;
547 evtchn_op_t evop; 475 evtchn_op_t evop;
548 u_long ring_ref, revtchn; 476 u_long ring_ref, revtchn;
549 char xsproto[32]; 477 char xsproto[32];
550 const char *proto; 478 const char *proto;
551 struct xenbus_device *xbusd = xbdi->xbdi_xbusd; 479 struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
552 480
553 XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path)); 481 XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path));
554 /* read comunication informations */ 482 /* read comunication informations */
555 err = xenbus_read_ul(NULL, xbusd->xbusd_otherend, 483 err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
556 "ring-ref", &ring_ref, 10); 484 "ring-ref", &ring_ref, 10);
557 if (err) { 485 if (err) {
558 xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref", 486 xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
559 xbusd->xbusd_otherend); 487 xbusd->xbusd_otherend);
560 return -1; 488 return -1;
561 } 489 }
562 XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref)); 490 XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref));
563 err = xenbus_read_ul(NULL, xbusd->xbusd_otherend, 491 err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
564 "event-channel", &revtchn, 10); 492 "event-channel", &revtchn, 10);
565 if (err) { 493 if (err) {
566 xenbus_dev_fatal(xbusd, err, "reading %s/event-channel", 494 xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
567 xbusd->xbusd_otherend); 495 xbusd->xbusd_otherend);
568 return -1; 496 return -1;
569 } 497 }
570 XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn)); 498 XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn));
571 err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol", 499 err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
572 xsproto, sizeof(xsproto)); 500 xsproto, sizeof(xsproto));
573 if (err) { 501 if (err) {
574 xbdi->xbdi_proto = XBDIP_NATIVE; 502 xbdi->xbdi_proto = XBDIP_NATIVE;
575 proto = "unspecified"; 503 proto = "unspecified";
576 XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path)); 504 XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path));
577 } else { 505 } else {
578 XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto)); 506 XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto));
579 if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) { 507 if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
580 xbdi->xbdi_proto = XBDIP_NATIVE; 508 xbdi->xbdi_proto = XBDIP_NATIVE;
581 proto = XEN_IO_PROTO_ABI_NATIVE; 509 proto = XEN_IO_PROTO_ABI_NATIVE;
582 } else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) { 510 } else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
583 xbdi->xbdi_proto = XBDIP_32; 511 xbdi->xbdi_proto = XBDIP_32;
584 proto = XEN_IO_PROTO_ABI_X86_32; 512 proto = XEN_IO_PROTO_ABI_X86_32;
585 } else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) { 513 } else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
586 xbdi->xbdi_proto = XBDIP_64; 514 xbdi->xbdi_proto = XBDIP_64;
587 proto = XEN_IO_PROTO_ABI_X86_64; 515 proto = XEN_IO_PROTO_ABI_X86_64;
588 } else { 516 } else {
589 aprint_error("xbd domain %d: unknown proto %s\n", 517 aprint_error("xbd domain %d: unknown proto %s\n",
590 xbdi->xbdi_domid, xsproto); 518 xbdi->xbdi_domid, xsproto);
591 return -1; 519 return -1;
592 } 520 }
593 } 521 }
594 522
595 /* allocate VA space and map rings */ 523 /* allocate VA space and map rings */
596 xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 524 xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
597 UVM_KMF_VAONLY); 525 UVM_KMF_VAONLY);
598 if (xbdi->xbdi_ring_va == 0) { 526 if (xbdi->xbdi_ring_va == 0) {
599 xenbus_dev_fatal(xbusd, ENOMEM, 527 xenbus_dev_fatal(xbusd, ENOMEM,
600 "can't get VA for ring", xbusd->xbusd_otherend); 528 "can't get VA for ring", xbusd->xbusd_otherend);
601 return -1; 529 return -1;
602 } 530 }
603 XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va)); 531 XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va));
604 532
605 grop.host_addr = xbdi->xbdi_ring_va; 533 grop.host_addr = xbdi->xbdi_ring_va;
606 grop.flags = GNTMAP_host_map; 534 grop.flags = GNTMAP_host_map;
607 grop.ref = ring_ref; 535 grop.ref = ring_ref;
608 grop.dom = xbdi->xbdi_domid; 536 grop.dom = xbdi->xbdi_domid;
609 err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 537 err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
610 &grop, 1); 538 &grop, 1);
611 if (err || grop.status) { 539 if (err || grop.status) {
612 aprint_error("xbdback %s: can't map grant ref: %d/%d\n", 540 aprint_error("xbdback %s: can't map grant ref: %d/%d\n",
613 xbusd->xbusd_path, err, grop.status); 541 xbusd->xbusd_path, err, grop.status);
614 xenbus_dev_fatal(xbusd, EINVAL, 542 xenbus_dev_fatal(xbusd, EINVAL,
615 "can't map ring", xbusd->xbusd_otherend); 543 "can't map ring", xbusd->xbusd_otherend);
616 goto err; 544 goto err;
617 } 545 }
618 xbdi->xbdi_ring_handle = grop.handle; 546 xbdi->xbdi_ring_handle = grop.handle;
619 XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, grop.handle)); 547 XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, grop.handle));
620 548
621 switch(xbdi->xbdi_proto) { 549 switch(xbdi->xbdi_proto) {
622 case XBDIP_NATIVE: 550 case XBDIP_NATIVE:
623 { 551 {
624 blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va; 552 blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
625 BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE); 553 BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE);
626 break; 554 break;
627 } 555 }
628 case XBDIP_32: 556 case XBDIP_32:
629 { 557 {
630 blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va; 558 blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va;
631 BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE); 559 BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE);
632 break; 560 break;
633 } 561 }
634 case XBDIP_64: 562 case XBDIP_64:
635 { 563 {
636 blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va; 564 blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va;
637 BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE); 565 BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE);
638 break; 566 break;
639 } 567 }
640 } 568 }
641 569
642 evop.cmd = EVTCHNOP_bind_interdomain; 570 evop.cmd = EVTCHNOP_bind_interdomain;
643 evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid; 571 evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
644 evop.u.bind_interdomain.remote_port = revtchn; 572 evop.u.bind_interdomain.remote_port = revtchn;
645 err = HYPERVISOR_event_channel_op(&evop); 573 err = HYPERVISOR_event_channel_op(&evop);
646 if (err) { 574 if (err) {
647 aprint_error("blkback %s: " 575 aprint_error("blkback %s: "
648 "can't get event channel: %d\n", 576 "can't get event channel: %d\n",
649 xbusd->xbusd_otherend, err); 577 xbusd->xbusd_otherend, err);
650 xenbus_dev_fatal(xbusd, err, 578 xenbus_dev_fatal(xbusd, err,
651 "can't bind event channel", xbusd->xbusd_otherend); 579 "can't bind event channel", xbusd->xbusd_otherend);
652 goto err2; 580 goto err2;
653 } 581 }
654 XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn)); 582 XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn));
655 xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port; 583 xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
656 584
657 xbdi->xbdi_ih = xen_intr_establish_xname(-1, &xen_pic, xbdi->xbdi_evtchn, 585 xbdi->xbdi_ih = xen_intr_establish_xname(-1, &xen_pic, xbdi->xbdi_evtchn,
658 IST_LEVEL, IPL_BIO, xbdback_evthandler, xbdi, false, 586 IST_LEVEL, IPL_BIO, xbdback_evthandler, xbdi, false,
659 xbdi->xbdi_name); 587 xbdi->xbdi_name);
660 KASSERT(xbdi->xbdi_ih != NULL); 588 KASSERT(xbdi->xbdi_ih != NULL);
661 aprint_verbose("xbd backend domain %d handle %#x (%d) " 589 aprint_verbose("xbd backend domain %d handle %#x (%d) "
662 "using event channel %d, protocol %s\n", xbdi->xbdi_domid, 590 "using event channel %d, protocol %s\n", xbdi->xbdi_domid,
663 xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto); 591 xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto);
664 592
665 /* enable the xbdback event handler machinery */ 593 /* enable the xbdback event handler machinery */
666 xbdi->xbdi_status = WAITING; 594 xbdi->xbdi_status = WAITING;
667 hypervisor_unmask_event(xbdi->xbdi_evtchn); 595 hypervisor_unmask_event(xbdi->xbdi_evtchn);
668 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn); 596 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
669 597
670 if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, 598 if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
671 xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0) 599 xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0)
672 return 0; 600 return 0;
673 601
674err2: 602err2:
675 /* unmap ring */ 603 /* unmap ring */
676 ungrop.host_addr = xbdi->xbdi_ring_va; 604 ungrop.host_addr = xbdi->xbdi_ring_va;
677 ungrop.handle = xbdi->xbdi_ring_handle; 605 ungrop.handle = xbdi->xbdi_ring_handle;
678 ungrop.dev_bus_addr = 0; 606 ungrop.dev_bus_addr = 0;
679 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 607 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
680 &ungrop, 1); 608 &ungrop, 1);
681 if (err) 609 if (err)
682 aprint_error("xbdback %s: unmap_grant_ref failed: %d\n", 610 aprint_error("xbdback %s: unmap_grant_ref failed: %d\n",
683 xbusd->xbusd_path, err); 611 xbusd->xbusd_path, err);
684 612
685err: 613err:
686 /* free ring VA space */ 614 /* free ring VA space */
687 uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY); 615 uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
688 return -1; 616 return -1;
689} 617}
690 618
691/* 619/*
692 * Signal a xbdback thread to disconnect. Done in 'xenwatch' thread context. 620 * Signal a xbdback thread to disconnect. Done in 'xenwatch' thread context.
693 */ 621 */
694static void 622static void
695xbdback_disconnect(struct xbdback_instance *xbdi) 623xbdback_disconnect(struct xbdback_instance *xbdi)
696{ 624{
697  625
698 mutex_enter(&xbdi->xbdi_lock); 626 mutex_enter(&xbdi->xbdi_lock);
699 if (xbdi->xbdi_status == DISCONNECTED) { 627 if (xbdi->xbdi_status == DISCONNECTED) {
700 mutex_exit(&xbdi->xbdi_lock); 628 mutex_exit(&xbdi->xbdi_lock);
701 return; 629 return;
702 } 630 }
703 hypervisor_mask_event(xbdi->xbdi_evtchn); 631 hypervisor_mask_event(xbdi->xbdi_evtchn);
704 xen_intr_disestablish(xbdi->xbdi_ih); 632 xen_intr_disestablish(xbdi->xbdi_ih);
705 633
706 /* signal thread that we want to disconnect, then wait for it */ 634 /* signal thread that we want to disconnect, then wait for it */
707 xbdi->xbdi_status = DISCONNECTING; 635 xbdi->xbdi_status = DISCONNECTING;
708 cv_signal(&xbdi->xbdi_cv); 636 cv_signal(&xbdi->xbdi_cv);
709 637
710 while (xbdi->xbdi_status != DISCONNECTED) 638 while (xbdi->xbdi_status != DISCONNECTED)
711 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock); 639 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
712 640
713 mutex_exit(&xbdi->xbdi_lock); 641 mutex_exit(&xbdi->xbdi_lock);
714 642
715 xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing); 643 xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing);
716} 644}
717 645
718static void 646static void
719xbdback_frontend_changed(void *arg, XenbusState new_state) 647xbdback_frontend_changed(void *arg, XenbusState new_state)
720{ 648{
721 struct xbdback_instance *xbdi = arg; 649 struct xbdback_instance *xbdi = arg;
722 struct xenbus_device *xbusd = xbdi->xbdi_xbusd; 650 struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
723 651
724 XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state)); 652 XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
725 switch(new_state) { 653 switch(new_state) {
726 case XenbusStateInitialising: 654 case XenbusStateInitialising:
727 break; 655 break;
728 case XenbusStateInitialised: 656 case XenbusStateInitialised:
729 case XenbusStateConnected: 657 case XenbusStateConnected:
730 if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) 658 if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN)
731 break; 659 break;
732 xbdback_connect(xbdi); 660 xbdback_connect(xbdi);
733 break; 661 break;
734 case XenbusStateClosing: 662 case XenbusStateClosing:
735 xbdback_disconnect(xbdi); 663 xbdback_disconnect(xbdi);
736 break; 664 break;
737 case XenbusStateClosed: 665 case XenbusStateClosed:
738 /* otherend_changed() should handle it for us */ 666 /* otherend_changed() should handle it for us */
739 panic("xbdback_frontend_changed: closed\n"); 667 panic("xbdback_frontend_changed: closed\n");
740 case XenbusStateUnknown: 668 case XenbusStateUnknown:
741 case XenbusStateInitWait: 669 case XenbusStateInitWait:
742 default: 670 default:
743 aprint_error("xbdback %s: invalid frontend state %d\n", 671 aprint_error("xbdback %s: invalid frontend state %d\n",
744 xbusd->xbusd_path, new_state); 672 xbusd->xbusd_path, new_state);
745 } 673 }
746 return; 674 return;
747} 675}
748 676
749static void 677static void
750xbdback_backend_changed(struct xenbus_watch *watch, 678xbdback_backend_changed(struct xenbus_watch *watch,
751 const char **vec, unsigned int len) 679 const char **vec, unsigned int len)
752{ 680{
753 struct xenbus_device *xbusd = watch->xbw_dev; 681 struct xenbus_device *xbusd = watch->xbw_dev;
754 struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie; 682 struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
755 int err; 683 int err;
756 long dev; 684 long dev;
757 char mode[32]; 685 char mode[32];
758 struct xenbus_transaction *xbt; 686 struct xenbus_transaction *xbt;
759 const char *devname; 687 const char *devname;
760 int major; 688 int major;
761 689
762 err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device", 690 err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
763 &dev, 10); 691 &dev, 10);
764 /* 692 /*
765 * An error can occur as the watch can fire up just after being 693 * An error can occur as the watch can fire up just after being
766 * registered. So we have to ignore error :( 694 * registered. So we have to ignore error :(
767 */ 695 */
768 if (err) 696 if (err)
769 return; 697 return;
770 /* 698 /*
771 * we can also fire up after having opened the device, don't try 699 * we can also fire up after having opened the device, don't try
772 * to do it twice. 700 * to do it twice.
773 */ 701 */
774 if (xbdi->xbdi_vp != NULL) { 702 if (xbdi->xbdi_vp != NULL) {
775 if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) { 703 if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) {
776 if (xbdi->xbdi_dev != dev) { 704 if (xbdi->xbdi_dev != dev) {
777 printf("xbdback %s: changing physical device " 705 printf("xbdback %s: changing physical device "
778 "from %#"PRIx64" to %#lx not supported\n", 706 "from %#"PRIx64" to %#lx not supported\n",
779 xbusd->xbusd_path, xbdi->xbdi_dev, dev); 707 xbusd->xbusd_path, xbdi->xbdi_dev, dev);
780 } 708 }
781 } 709 }
782 return; 710 return;
783 } 711 }
784 xbdi->xbdi_dev = dev; 712 xbdi->xbdi_dev = dev;
785 err = xenbus_read(NULL, xbusd->xbusd_path, "mode", mode, sizeof(mode)); 713 err = xenbus_read(NULL, xbusd->xbusd_path, "mode", mode, sizeof(mode));
786 if (err) { 714 if (err) {
787 printf("xbdback: failed to read %s/mode: %d\n", 715 printf("xbdback: failed to read %s/mode: %d\n",
788 xbusd->xbusd_path, err); 716 xbusd->xbusd_path, err);
789 return; 717 return;
790 } 718 }
791 if (mode[0] == 'w') 719 if (mode[0] == 'w')
792 xbdi->xbdi_ro = false; 720 xbdi->xbdi_ro = false;
793 else 721 else
794 xbdi->xbdi_ro = true; 722 xbdi->xbdi_ro = true;
795 major = major(xbdi->xbdi_dev); 723 major = major(xbdi->xbdi_dev);
796 devname = devsw_blk2name(major); 724 devname = devsw_blk2name(major);
797 if (devname == NULL) { 725 if (devname == NULL) {
798 printf("xbdback %s: unknown device 0x%"PRIx64"\n", 726 printf("xbdback %s: unknown device 0x%"PRIx64"\n",
799 xbusd->xbusd_path, xbdi->xbdi_dev); 727 xbusd->xbusd_path, xbdi->xbdi_dev);
800 return; 728 return;
801 } 729 }
802 xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev); 730 xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
803 if (xbdi->xbdi_bdevsw == NULL) { 731 if (xbdi->xbdi_bdevsw == NULL) {
804 printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n", 732 printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n",
805 xbusd->xbusd_path, xbdi->xbdi_dev); 733 xbusd->xbusd_path, xbdi->xbdi_dev);
806 return; 734 return;
807 } 735 }
808 err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp); 736 err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
809 if (err) { 737 if (err) {
810 printf("xbdback %s: can't open device 0x%"PRIx64": %d\n", 738 printf("xbdback %s: can't open device 0x%"PRIx64": %d\n",
811 xbusd->xbusd_path, xbdi->xbdi_dev, err); 739 xbusd->xbusd_path, xbdi->xbdi_dev, err);
812 return; 740 return;
813 } 741 }
814 err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY); 742 err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
815 if (err) { 743 if (err) {
816 printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n", 744 printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n",
817 xbusd->xbusd_path, xbdi->xbdi_dev, err); 745 xbusd->xbusd_path, xbdi->xbdi_dev, err);
818 vrele(xbdi->xbdi_vp); 746 vrele(xbdi->xbdi_vp);
819 return; 747 return;
820 } 748 }
821 err = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED); 749 err = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
822 if (err) { 750 if (err) {
823 printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n", 751 printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n",
824 xbusd->xbusd_path, xbdi->xbdi_dev, err); 752 xbusd->xbusd_path, xbdi->xbdi_dev, err);
825 vput(xbdi->xbdi_vp); 753 vput(xbdi->xbdi_vp);
826 return; 754 return;
827 } 755 }
828 VOP_UNLOCK(xbdi->xbdi_vp); 756 VOP_UNLOCK(xbdi->xbdi_vp);
829 757
830 /* dk device; get wedge data */ 758 /* dk device; get wedge data */
831 struct dkwedge_info wi; 759 struct dkwedge_info wi;
832 if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) { 760 if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) {
833 xbdi->xbdi_size = wi.dkw_size; 761 xbdi->xbdi_size = wi.dkw_size;
834 printf("xbd backend: attach device %s (size %" PRIu64 ") " 762 printf("xbd backend: attach device %s (size %" PRIu64 ") "
835 "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size, 763 "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
836 xbdi->xbdi_domid); 764 xbdi->xbdi_domid);
837 } else { 765 } else {
838 /* If both Ioctls failed set device size to 0 and return */ 766 /* If both Ioctls failed set device size to 0 and return */
839 printf("xbdback %s: can't DIOCGWEDGEINFO device " 767 printf("xbdback %s: can't DIOCGWEDGEINFO device "
840 "0x%"PRIx64": %d\n", xbusd->xbusd_path, 768 "0x%"PRIx64": %d\n", xbusd->xbusd_path,
841 xbdi->xbdi_dev, err);  769 xbdi->xbdi_dev, err);
842 xbdi->xbdi_size = xbdi->xbdi_dev = 0; 770 xbdi->xbdi_size = xbdi->xbdi_dev = 0;
843 vn_close(xbdi->xbdi_vp, FREAD, NOCRED); 771 vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
844 xbdi->xbdi_vp = NULL; 772 xbdi->xbdi_vp = NULL;
845 return; 773 return;
846 } 774 }
847again: 775again:
848 xbt = xenbus_transaction_start(); 776 xbt = xenbus_transaction_start();
849 if (xbt == NULL) { 777 if (xbt == NULL) {
850 printf("xbdback %s: can't start transaction\n", 778 printf("xbdback %s: can't start transaction\n",
851 xbusd->xbusd_path); 779 xbusd->xbusd_path);
852 return; 780 return;
853 } 781 }
854 err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 , 782 err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
855 xbdi->xbdi_size); 783 xbdi->xbdi_size);
856 if (err) { 784 if (err) {
857 printf("xbdback: failed to write %s/sectors: %d\n", 785 printf("xbdback: failed to write %s/sectors: %d\n",
858 xbusd->xbusd_path, err); 786 xbusd->xbusd_path, err);
859 goto abort; 787 goto abort;
860 } 788 }
861 err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u", 789 err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
862 xbdi->xbdi_ro ? VDISK_READONLY : 0); 790 xbdi->xbdi_ro ? VDISK_READONLY : 0);
863 if (err) { 791 if (err) {
864 printf("xbdback: failed to write %s/info: %d\n", 792 printf("xbdback: failed to write %s/info: %d\n",
865 xbusd->xbusd_path, err); 793 xbusd->xbusd_path, err);
866 goto abort; 794 goto abort;
867 } 795 }
868 err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu", 796 err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
869 (u_long)DEV_BSIZE); 797 (u_long)DEV_BSIZE);
870 if (err) { 798 if (err) {
871 printf("xbdback: failed to write %s/sector-size: %d\n", 799 printf("xbdback: failed to write %s/sector-size: %d\n",
872 xbusd->xbusd_path, err); 800 xbusd->xbusd_path, err);
873 goto abort; 801 goto abort;
874 } 802 }
875 err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache", 803 err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
876 "%u", 1); 804 "%u", 1);
877 if (err) { 805 if (err) {
878 printf("xbdback: failed to write %s/feature-flush-cache: %d\n", 806 printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
879 xbusd->xbusd_path, err); 807 xbusd->xbusd_path, err);
880 goto abort; 808 goto abort;
881 } 809 }
882 err = xenbus_transaction_end(xbt, 0); 810 err = xenbus_transaction_end(xbt, 0);
883 if (err == EAGAIN) 811 if (err == EAGAIN)
884 goto again; 812 goto again;
885 if (err) { 813 if (err) {
886 printf("xbdback %s: can't end transaction: %d\n", 814 printf("xbdback %s: can't end transaction: %d\n",
887 xbusd->xbusd_path, err); 815 xbusd->xbusd_path, err);
888 } 816 }
889 err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected); 817 err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
890 if (err) { 818 if (err) {
891 printf("xbdback %s: can't switch state: %d\n", 819 printf("xbdback %s: can't switch state: %d\n",
892 xbusd->xbusd_path, err); 820 xbusd->xbusd_path, err);
893 } 821 }
894 return; 822 return;
895abort: 823abort:
896 xenbus_transaction_end(xbt, 1); 824 xenbus_transaction_end(xbt, 1);
897} 825}
898 826
899/* 827/*
900 * Used by a xbdi thread to signal that it is now disconnected. 828 * Used by a xbdi thread to signal that it is now disconnected.
901 */ 829 */
902static void 830static void
903xbdback_finish_disconnect(struct xbdback_instance *xbdi) 831xbdback_finish_disconnect(struct xbdback_instance *xbdi)
904{ 832{
905 KASSERT(mutex_owned(&xbdi->xbdi_lock)); 833 KASSERT(mutex_owned(&xbdi->xbdi_lock));
906 KASSERT(xbdi->xbdi_status == DISCONNECTING); 834 KASSERT(xbdi->xbdi_status == DISCONNECTING);
907 835
908 xbdi->xbdi_status = DISCONNECTED; 836 xbdi->xbdi_status = DISCONNECTED;
909 837
910 cv_signal(&xbdi->xbdi_cv); 838 cv_signal(&xbdi->xbdi_cv);
911} 839}
912 840
913static bool 841static bool
914xbdif_lookup(domid_t dom , uint32_t handle) 842xbdif_lookup(domid_t dom , uint32_t handle)
915{ 843{
916 struct xbdback_instance *xbdi; 844 struct xbdback_instance *xbdi;
917 bool found = false; 845 bool found = false;
918 846
919 mutex_enter(&xbdback_lock); 847 mutex_enter(&xbdback_lock);
920 SLIST_FOREACH(xbdi, &xbdback_instances, next) { 848 SLIST_FOREACH(xbdi, &xbdback_instances, next) {
921 if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) { 849 if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) {
922 found = true; 850 found = true;
923 break; 851 break;
924 } 852 }
925 } 853 }
926 mutex_exit(&xbdback_lock); 854 mutex_exit(&xbdback_lock);
927 855
928 return found; 856 return found;
929} 857}
930 858
931static int 859static int
932xbdback_evthandler(void *arg) 860xbdback_evthandler(void *arg)
933{ 861{
934 struct xbdback_instance *xbdi = arg; 862 struct xbdback_instance *xbdi = arg;
935 863
936 XENPRINTF(("xbdback_evthandler domain %d: cont %p\n", 864 XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
937 xbdi->xbdi_domid, xbdi->xbdi_cont)); 865 xbdi->xbdi_domid, xbdi->xbdi_cont));
938 866
939 xbdback_wakeup_thread(xbdi); 867 xbdback_wakeup_thread(xbdi);
940 868
941 return 1; 869 return 1;
942} 870}
943 871
944/* 872/*
945 * Main thread routine for one xbdback instance. Woken up by 873 * Main thread routine for one xbdback instance. Woken up by
 946 * xbdback_evthandler when a domain has I/O work scheduled in an I/O ring. 874 * xbdback_evthandler when a domain has I/O work scheduled in an I/O ring.
947 */ 875 */
948static void 876static void
949xbdback_thread(void *arg) 877xbdback_thread(void *arg)
950{ 878{
951 struct xbdback_instance *xbdi = arg; 879 struct xbdback_instance *xbdi = arg;
952 880
953 for (;;) { 881 for (;;) {
954 mutex_enter(&xbdi->xbdi_lock); 882 mutex_enter(&xbdi->xbdi_lock);
955 switch (xbdi->xbdi_status) { 883 switch (xbdi->xbdi_status) {
956 case WAITING: 884 case WAITING:
957 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock); 885 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
958 mutex_exit(&xbdi->xbdi_lock); 886 mutex_exit(&xbdi->xbdi_lock);
959 break; 887 break;
960 case RUN: 888 case RUN:
961 xbdi->xbdi_status = WAITING; /* reset state */ 889 xbdi->xbdi_status = WAITING; /* reset state */
962 mutex_exit(&xbdi->xbdi_lock); 890 mutex_exit(&xbdi->xbdi_lock);
963 891
964 if (xbdi->xbdi_cont == NULL) { 892 if (xbdi->xbdi_cont == NULL) {
965 xbdi->xbdi_cont = xbdback_co_main; 893 xbdi->xbdi_cont = xbdback_co_main;
966 } 894 }
967 895
968 xbdback_trampoline(xbdi, xbdi); 896 xbdback_trampoline(xbdi, xbdi);
969 break; 897 break;
970 case DISCONNECTING: 898 case DISCONNECTING:
971 if (xbdi->xbdi_pendingreqs > 0) { 899 if (xbdi->xbdi_pendingreqs > 0) {
972 /* there are pending I/Os. Wait for them. */ 900 /* there are pending I/Os. Wait for them. */
973 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock); 901 cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
974 mutex_exit(&xbdi->xbdi_lock); 902 mutex_exit(&xbdi->xbdi_lock);
975 break; 903 break;
976 } 904 }
977  905
978 /* All I/Os should have been processed by now, 906 /* All I/Os should have been processed by now,
979 * xbdi_refcnt should drop to 0 */ 907 * xbdi_refcnt should drop to 0 */
980 xbdi_put(xbdi); 908 xbdi_put(xbdi);
981 KASSERT(xbdi->xbdi_refcnt == 0); 909 KASSERT(xbdi->xbdi_refcnt == 0);
982 mutex_exit(&xbdi->xbdi_lock); 910 mutex_exit(&xbdi->xbdi_lock);
983 kthread_exit(0); 911 kthread_exit(0);
984 break; 912 break;
985 default: 913 default:
986 panic("%s: invalid state %d", 914 panic("%s: invalid state %d",
987 xbdi->xbdi_name, xbdi->xbdi_status); 915 xbdi->xbdi_name, xbdi->xbdi_status);
988 } 916 }
989 } 917 }
990} 918}
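
The per-instance thread above is a three-state machine (WAITING, RUN, DISCONNECTING) driven by xbdi_status under xbdi_lock. A minimal userspace sketch of the same pattern, with pthreads standing in for the kernel's mutex(9)/condvar(9) primitives and every name invented for the example:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

enum wstatus { WAITING, RUN, DISCONNECTING };

struct worker {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        enum wstatus    status;
};

static void *
worker_thread(void *arg)
{
        struct worker *w = arg;

        for (;;) {
                pthread_mutex_lock(&w->lock);
                switch (w->status) {
                case WAITING:
                        pthread_cond_wait(&w->cv, &w->lock);
                        pthread_mutex_unlock(&w->lock);
                        break;
                case RUN:
                        w->status = WAITING;    /* reset before working */
                        pthread_mutex_unlock(&w->lock);
                        printf("processing requests\n");
                        break;
                case DISCONNECTING:
                        pthread_mutex_unlock(&w->lock);
                        return NULL;
                }
        }
}

static void
worker_wakeup(struct worker *w, enum wstatus s)
{
        pthread_mutex_lock(&w->lock);
        /* only move WAITING -> RUN, but always allow disconnect */
        if (w->status == WAITING || s == DISCONNECTING)
                w->status = s;
        pthread_cond_broadcast(&w->cv);
        pthread_mutex_unlock(&w->lock);
}

int
main(void)
{
        struct worker w = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, WAITING };
        pthread_t t;

        pthread_create(&t, NULL, worker_thread, &w);
        worker_wakeup(&w, RUN);
        sleep(1);
        worker_wakeup(&w, DISCONNECTING);
        pthread_join(t, NULL);
        return 0;
}

Resetting RUN back to WAITING before processing, as the driver does, is what keeps a wakeup arriving mid-processing from being lost: the next loop iteration observes RUN again.
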
991 919
992static void * 920static void *
993xbdback_co_main(struct xbdback_instance *xbdi, void *obj) 921xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
994{ 922{
995 (void)obj; 923 (void)obj;
996 924
997 xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod; 925 xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
998 xen_rmb(); /* ensure we see all requests up to req_prod */ 926 xen_rmb(); /* ensure we see all requests up to req_prod */
999 /* 927 /*
 1000 * note that we'll eventually get a full ring of requests. 928 * note that we'll eventually get a full ring of requests.
1001 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod) 929 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod)
1002 */ 930 */
1003 xbdi->xbdi_cont = xbdback_co_main_loop; 931 xbdi->xbdi_cont = xbdback_co_main_loop;
1004 return xbdi; 932 return xbdi;
1005} 933}
1006 934
1007/* 935/*
1008 * Fetch a blkif request from the ring, and pass control to the appropriate 936 * Fetch a blkif request from the ring, and pass control to the appropriate
1009 * continuation. 937 * continuation.
1010 * If someone asked for disconnection, do not fetch any more request from 938 * If someone asked for disconnection, do not fetch any more request from
1011 * the ring. 939 * the ring.
1012 */ 940 */
1013static void * 941static void *
1014xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)  942xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
1015{ 943{
1016 blkif_request_t *req; 944 blkif_request_t *req;
1017 blkif_x86_32_request_t *req32; 945 blkif_x86_32_request_t *req32;
1018 blkif_x86_64_request_t *req64; 946 blkif_x86_64_request_t *req64;
1019 947
1020 (void)obj; 948 (void)obj;
1021 req = &xbdi->xbdi_xen_req; 949 req = &xbdi->xbdi_xen_req;
1022 if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) { 950 if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
1023 switch(xbdi->xbdi_proto) { 951 switch(xbdi->xbdi_proto) {
1024 case XBDIP_NATIVE: 952 case XBDIP_NATIVE:
1025 memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n, 953 memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
1026 xbdi->xbdi_ring.ring_n.req_cons), 954 xbdi->xbdi_ring.ring_n.req_cons),
1027 sizeof(blkif_request_t)); 955 sizeof(blkif_request_t));
1028 break; 956 break;
1029 case XBDIP_32: 957 case XBDIP_32:
1030 req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32, 958 req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1031 xbdi->xbdi_ring.ring_n.req_cons); 959 xbdi->xbdi_ring.ring_n.req_cons);
1032 req->operation = req32->operation; 960 req->operation = req32->operation;
1033 req->nr_segments = req32->nr_segments; 961 req->nr_segments = req32->nr_segments;
1034 req->handle = req32->handle; 962 req->handle = req32->handle;
1035 req->id = req32->id; 963 req->id = req32->id;
1036 req->sector_number = req32->sector_number; 964 req->sector_number = req32->sector_number;
1037 break; 965 break;
1038  966
1039 case XBDIP_64: 967 case XBDIP_64:
1040 req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64, 968 req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1041 xbdi->xbdi_ring.ring_n.req_cons); 969 xbdi->xbdi_ring.ring_n.req_cons);
1042 req->operation = req64->operation; 970 req->operation = req64->operation;
1043 req->nr_segments = req64->nr_segments; 971 req->nr_segments = req64->nr_segments;
1044 req->handle = req64->handle; 972 req->handle = req64->handle;
1045 req->id = req64->id; 973 req->id = req64->id;
1046 req->sector_number = req64->sector_number; 974 req->sector_number = req64->sector_number;
1047 break; 975 break;
1048 } 976 }
1049 __insn_barrier(); 977 __insn_barrier();
1050 XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x " 978 XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
1051 "resp_prod 0x%x id %" PRIu64 "\n", req->operation, 979 "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
1052 xbdi->xbdi_ring.ring_n.req_cons, 980 xbdi->xbdi_ring.ring_n.req_cons,
1053 xbdi->xbdi_req_prod, 981 xbdi->xbdi_req_prod,
1054 xbdi->xbdi_ring.ring_n.rsp_prod_pvt, 982 xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
1055 req->id)); 983 req->id));
1056 switch(req->operation) { 984 switch(req->operation) {
1057 case BLKIF_OP_READ: 985 case BLKIF_OP_READ:
1058 case BLKIF_OP_WRITE: 986 case BLKIF_OP_WRITE:
1059 xbdi->xbdi_cont = xbdback_co_io; 987 xbdi->xbdi_cont = xbdback_co_io;
1060 break; 988 break;
1061 case BLKIF_OP_FLUSH_DISKCACHE: 989 case BLKIF_OP_FLUSH_DISKCACHE:
1062 xbdi_get(xbdi); 990 xbdi_get(xbdi);
1063 xbdi->xbdi_cont = xbdback_co_cache_flush; 991 xbdi->xbdi_cont = xbdback_co_cache_flush;
1064 break; 992 break;
1065 default: 993 default:
1066 if (ratecheck(&xbdi->xbdi_lasterr_time, 994 if (ratecheck(&xbdi->xbdi_lasterr_time,
1067 &xbdback_err_intvl)) { 995 &xbdback_err_intvl)) {
1068 printf("%s: unknown operation %d\n", 996 printf("%s: unknown operation %d\n",
1069 xbdi->xbdi_name, req->operation); 997 xbdi->xbdi_name, req->operation);
1070 } 998 }
1071 xbdback_send_reply(xbdi, req->id, req->operation, 999 xbdback_send_reply(xbdi, req->id, req->operation,
1072 BLKIF_RSP_ERROR); 1000 BLKIF_RSP_ERROR);
1073 xbdi->xbdi_cont = xbdback_co_main_incr; 1001 xbdi->xbdi_cont = xbdback_co_main_incr;
1074 break; 1002 break;
1075 } 1003 }
1076 } else { 1004 } else {
1077 xbdi->xbdi_cont = xbdback_co_main_done; 1005 KASSERT(xbdi->xbdi_io == NULL);
 1006 xbdi->xbdi_cont = xbdback_co_main_done2;
1078 } 1007 }
1079 return xbdi; 1008 return xbdi;
1080} 1009}
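
The field-by-field copies in the XBDIP_32 and XBDIP_64 cases exist because the i386 and amd64 frontend ABIs pad blkif_request differently, so only the native layout can be fetched with a single memcpy(). A reduced illustration of the point (struct contents abbreviated, and the packed/aligned attributes only approximate the i386 padding rules; this is not the real blkif ABI):

#include <stdint.h>
#include <stdio.h>

/* i386-style layout: 64-bit fields are not padded to 8-byte offsets */
struct req32 {
        uint8_t  operation;
        uint8_t  nr_segments;
        uint16_t handle;
        uint64_t id;
        uint64_t sector_number;
} __attribute__((packed, aligned(4)));

/* native layout: 64-bit fields 8-byte aligned, so the struct is larger */
struct req_native {
        uint8_t  operation;
        uint8_t  nr_segments;
        uint16_t handle;
        uint64_t id;
        uint64_t sector_number;
};

static void
normalize(struct req_native *dst, const struct req32 *src)
{
        /* layouts differ, so copy field by field, never memcpy() */
        dst->operation     = src->operation;
        dst->nr_segments   = src->nr_segments;
        dst->handle        = src->handle;
        dst->id            = src->id;
        dst->sector_number = src->sector_number;
}

int
main(void)
{
        struct req32 r32 = { .operation = 0, .id = 42, .sector_number = 8 };
        struct req_native rn;

        normalize(&rn, &r32);
        printf("sizeof: 32-bit %zu vs native %zu, id=%llu\n",
            sizeof(r32), sizeof(rn), (unsigned long long)rn.id);
        return 0;
}
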
1081 1010
1082/* 1011/*
1083 * Increment consumer index and move on to the next request. In case 1012 * Increment consumer index and move on to the next request. In case
1084 * we want to disconnect, leave continuation now. 1013 * we want to disconnect, leave continuation now.
1085 */ 1014 */
1086static void * 1015static void *
1087xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj) 1016xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
1088{ 1017{
1089 (void)obj; 1018 (void)obj;
1090 blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n; 1019 blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;
1091 1020
1092 ring->req_cons++; 1021 ring->req_cons++;
1093 1022
1094 /* 1023 /*
1095 * Do not bother with locking here when checking for xbdi_status: if 1024 * Do not bother with locking here when checking for xbdi_status: if
1096 * we get a transient state, we will get the right value at 1025 * we get a transient state, we will get the right value at
1097 * the next increment. 1026 * the next increment.
1098 */ 1027 */
1099 if (xbdi->xbdi_status == DISCONNECTING) 1028 if (xbdi->xbdi_status == DISCONNECTING)
1100 xbdi->xbdi_cont = NULL; 1029 xbdi->xbdi_cont = NULL;
1101 else 1030 else
1102 xbdi->xbdi_cont = xbdback_co_main_loop; 1031 xbdi->xbdi_cont = xbdback_co_main_loop;
1103 1032
1104 /* 1033 /*
1105 * Each time the thread processes a full ring of requests, give 1034 * Each time the thread processes a full ring of requests, give
1106 * a chance to other threads to process I/Os too 1035 * a chance to other threads to process I/Os too
1107 */ 1036 */
1108 if ((ring->req_cons % BLKIF_RING_SIZE) == 0) 1037 if ((ring->req_cons % BLKIF_RING_SIZE) == 0)
1109 yield(); 1038 yield();
1110 1039
1111 return xbdi; 1040 return xbdi;
1112} 1041}
1113 1042
1114/* 1043/*
1115 * Ring processing is over. If there are any I/O still present for this 
1116 * instance, handle them first. 
1117 */ 
1118static void * 
1119xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj) 
1120{ 
1121 (void)obj; 
1122 if (xbdi->xbdi_io != NULL) { 
1123 KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ || 
1124 xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE); 
1125 xbdi->xbdi_cont = xbdback_co_map_io; 
1126 xbdi->xbdi_cont_aux = xbdback_co_main_done2; 
1127 } else { 
1128 xbdi->xbdi_cont = xbdback_co_main_done2; 
1129 } 
1130 return xbdi; 
1131} 
1132 
1133/* 
1134 * Check for requests in the instance's ring. In case there are, start again 1044 * Check for requests in the instance's ring. In case there are, start again
1135 * from the beginning. If not, stall. 1045 * from the beginning. If not, stall.
1136 */ 1046 */
1137static void * 1047static void *
1138xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj) 1048xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
1139{ 1049{
1140 int work_to_do; 1050 int work_to_do;
1141 1051
  1052 KASSERT(xbdi->xbdi_io == NULL);
1142 RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do); 1053 RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
1143 if (work_to_do) 1054 if (work_to_do)
1144 xbdi->xbdi_cont = xbdback_co_main; 1055 xbdi->xbdi_cont = xbdback_co_main;
1145 else 1056 else
1146 xbdi->xbdi_cont = NULL; 1057 xbdi->xbdi_cont = NULL;
1147 1058
1148 return xbdi; 1059 return xbdi;
1149} 1060}
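
RING_FINAL_CHECK_FOR_REQUESTS is what keeps the thread from stalling if the frontend queues a request at the exact moment the backend decides the ring is empty: the macro arms the ring's event index first and re-checks the producer afterwards. A sketch of the idea behind the macro, with C11 atomics standing in for the ring barriers (simplified to one producer and one consumer):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sring {
        _Atomic unsigned req_prod;   /* advanced by the frontend */
        _Atomic unsigned req_event;  /* frontend kicks us when prod hits this */
        unsigned req_cons;           /* backend-private consumer index */
};

static bool
final_check_for_requests(struct sring *r)
{
        if (atomic_load(&r->req_prod) != r->req_cons)
                return true;
        /* arm the event first ... */
        atomic_store(&r->req_event, r->req_cons + 1);
        /* ... then re-check, so a request queued in between is seen
           either here or via the event channel, never missed */
        return atomic_load(&r->req_prod) != r->req_cons;
}

int
main(void)
{
        struct sring r = { 0 };

        printf("work_to_do=%d\n", final_check_for_requests(&r));
        return 0;
}
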
1150 1061
1151/* 1062/*
1152 * Frontend requested a cache flush operation. 1063 * Frontend requested a cache flush operation.
1153 */ 1064 */
1154static void * 1065static void *
1155xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj) 1066xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj __unused)
1156{ 
1157 (void)obj; 
1158 
1159 XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj)); 
1160 if (xbdi->xbdi_io != NULL) { 
1161 /* Some I/Os are required for this instance. Process them. */ 
1162 KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ || 
1163 xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE); 
1164 KASSERT(xbdi->xbdi_pendingreqs > 0); 
1165 xbdi->xbdi_cont = xbdback_co_map_io; 
1166 xbdi->xbdi_cont_aux = xbdback_co_cache_flush2; 
1167 } else { 
1168 xbdi->xbdi_cont = xbdback_co_cache_flush2; 
1169 } 
1170 return xbdi; 
1171} 
1172 
1173static void * 
1174xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj) 
1175{ 1067{
1176 (void)obj; 
1177 XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj)); 
1178 if (xbdi->xbdi_pendingreqs > 0) { 1068 if (xbdi->xbdi_pendingreqs > 0) {
1179 /* 1069 /*
1180 * There are pending requests. 1070 * There are pending requests.
1181 * Event or iodone() will restart processing 1071 * Event or iodone() will restart processing
1182 */ 1072 */
1183 xbdi->xbdi_cont = NULL; 1073 xbdi->xbdi_cont = NULL;
1184 xbdi_put(xbdi); 1074 xbdi_put(xbdi);
1185 return NULL; 1075 return NULL;
1186 } 1076 }
1187 xbdi->xbdi_cont = xbdback_co_cache_doflush; 1077 xbdi->xbdi_cont = xbdback_co_cache_doflush;
1188 return xbdback_pool_get(&xbdback_io_pool, xbdi); 1078 return xbdback_pool_get(&xbdback_io_pool, xbdi);
1189} 1079}
1190 1080
1191/* Start the flush work */ 1081/* Start the flush work */
1192static void * 1082static void *
1193xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj) 1083xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
1194{ 1084{
1195 struct xbdback_io *xbd_io; 1085 struct xbdback_io *xbd_io;
1196 1086
1197 XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj)); 1087 XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
1198 xbd_io = xbdi->xbdi_io = obj; 1088 xbd_io = xbdi->xbdi_io = obj;
1199 xbd_io->xio_xbdi = xbdi; 1089 xbd_io->xio_xbdi = xbdi;
1200 xbd_io->xio_operation = xbdi->xbdi_xen_req.operation; 1090 xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
1201 xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id; 1091 xbd_io->xio_id = xbdi->xbdi_xen_req.id;
1202 xbdi->xbdi_cont = xbdback_co_do_io; 1092 xbdi->xbdi_cont = xbdback_co_do_io;
1203 return xbdi; 1093 return xbdi;
1204} 1094}
1205 1095
1206/* 1096/*
1207 * A read or write I/O request must be processed. Do some checks first, 1097 * A read or write I/O request must be processed. Do some checks first,
1208 * then get the segment information directly from the ring request. 1098 * then get the segment information directly from the ring request.
1209 */ 1099 */
1210static void * 1100static void *
1211xbdback_co_io(struct xbdback_instance *xbdi, void *obj) 1101xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
1212{  1102{
1213 int i, error; 1103 int i, error;
1214 blkif_request_t *req; 1104 blkif_request_t *req;
1215 blkif_x86_32_request_t *req32; 1105 blkif_x86_32_request_t *req32;
1216 blkif_x86_64_request_t *req64; 1106 blkif_x86_64_request_t *req64;
1217 1107
1218 (void)obj; 1108 (void)obj;
1219 1109
1220 /* some sanity checks */ 1110 /* some sanity checks */
1221 req = &xbdi->xbdi_xen_req; 1111 req = &xbdi->xbdi_xen_req;
1222 if (req->nr_segments < 1 || 1112 if (req->nr_segments < 1 ||
1223 req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) { 1113 req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
1224 if (ratecheck(&xbdi->xbdi_lasterr_time, 1114 if (ratecheck(&xbdi->xbdi_lasterr_time,
1225 &xbdback_err_intvl)) { 1115 &xbdback_err_intvl)) {
1226 printf("%s: invalid number of segments: %d\n", 1116 printf("%s: invalid number of segments: %d\n",
1227 xbdi->xbdi_name, 1117 xbdi->xbdi_name,
1228 xbdi->xbdi_xen_req.nr_segments); 1118 xbdi->xbdi_xen_req.nr_segments);
1229 } 1119 }
1230 error = EINVAL; 1120 error = EINVAL;
1231 goto end; 1121 goto end;
1232 } 1122 }
1233 1123
1234 KASSERT(req->operation == BLKIF_OP_READ || 1124 KASSERT(req->operation == BLKIF_OP_READ ||
1235 req->operation == BLKIF_OP_WRITE); 1125 req->operation == BLKIF_OP_WRITE);
1236 if (req->operation == BLKIF_OP_WRITE) { 1126 if (req->operation == BLKIF_OP_WRITE) {
1237 if (xbdi->xbdi_ro) { 1127 if (xbdi->xbdi_ro) {
1238 error = EROFS; 1128 error = EROFS;
1239 goto end; 1129 goto end;
1240 } 1130 }
1241 } 1131 }
1242 1132
1243 xbdi->xbdi_segno = 0; 
1244 
1245 /* copy request segments */ 1133 /* copy request segments */
1246 switch(xbdi->xbdi_proto) { 1134 switch(xbdi->xbdi_proto) {
1247 case XBDIP_NATIVE: 1135 case XBDIP_NATIVE:
1248 /* already copied in xbdback_co_main_loop */ 1136 /* already copied in xbdback_co_main_loop */
1249 break; 1137 break;
1250 case XBDIP_32: 1138 case XBDIP_32:
1251 req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32, 1139 req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1252 xbdi->xbdi_ring.ring_n.req_cons); 1140 xbdi->xbdi_ring.ring_n.req_cons);
1253 for (i = 0; i < req->nr_segments; i++) 1141 for (i = 0; i < req->nr_segments; i++)
1254 req->seg[i] = req32->seg[i]; 1142 req->seg[i] = req32->seg[i];
1255 break; 1143 break;
1256 case XBDIP_64: 1144 case XBDIP_64:
1257 req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64, 1145 req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1258 xbdi->xbdi_ring.ring_n.req_cons); 1146 xbdi->xbdi_ring.ring_n.req_cons);
1259 for (i = 0; i < req->nr_segments; i++) 1147 for (i = 0; i < req->nr_segments; i++)
1260 req->seg[i] = req64->seg[i]; 1148 req->seg[i] = req64->seg[i];
1261 break; 1149 break;
1262 } 1150 }
1263 1151
1264 xbdi->xbdi_cont = xbdback_co_io_gotreq; 1152 KASSERT(xbdi->xbdi_io == NULL);
1265 return xbdback_pool_get(&xbdback_request_pool, xbdi); 1153 xbdi->xbdi_cont = xbdback_co_io_gotio;
 1154 return xbdback_pool_get(&xbdback_io_pool, xbdi);
1266 1155
1267 end: 1156 end:
1268 xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id, 1157 xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
1269 xbdi->xbdi_xen_req.operation, error); 1158 xbdi->xbdi_xen_req.operation,
 1159 (error == EROFS) ? BLKIF_RSP_EOPNOTSUPP : BLKIF_RSP_ERROR);
1270 xbdi->xbdi_cont = xbdback_co_main_incr; 1160 xbdi->xbdi_cont = xbdback_co_main_incr;
1271 return xbdi; 1161 return xbdi;
1272} 1162}
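
With the coalescing machinery gone, xbdback_co_io is reduced to sanity checks plus the segment copy, and a write to a read-only export is now reported as BLKIF_RSP_EOPNOTSUPP instead of a generic error. A compact sketch of just the validation step (blkif constants inlined from the ABI; the helper name is invented):

#include <stdint.h>
#include <stdio.h>

#define BLKIF_MAX_SEGMENTS_PER_REQUEST  11
#define BLKIF_OP_WRITE                  1
#define BLKIF_RSP_EOPNOTSUPP            (-2)
#define BLKIF_RSP_ERROR                 (-1)
#define BLKIF_RSP_OKAY                  0

/* 0 if the request is acceptable, else the status to send back */
static int
validate_request(uint8_t op, uint8_t nr_segments, int read_only)
{
        if (nr_segments < 1 || nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
                return BLKIF_RSP_ERROR;
        if (op == BLKIF_OP_WRITE && read_only)
                return BLKIF_RSP_EOPNOTSUPP;    /* the EROFS case above */
        return BLKIF_RSP_OKAY;
}

int
main(void)
{
        printf("%d\n", validate_request(BLKIF_OP_WRITE, 4, 1)); /* -2 */
        return 0;
}
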
1273 1163
1274/* 
1275 * We have fetched segment requests from the ring. In case there are already 
1276 * I/Os prepared for this instance, we can try coalescing the requests 
1277 * with these I/Os. 
1278 */ 
1279static void * 
1280xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj) 
1281{ 
1282 struct xbdback_request *xrq; 
1283 
1284 xrq = xbdi->xbdi_req = obj; 
1285  
1286 xrq->rq_xbdi = xbdi; 
1287 xrq->rq_iocount = 0; 
1288 xrq->rq_ioerrs = 0; 
1289 xrq->rq_id = xbdi->xbdi_xen_req.id; 
1290 xrq->rq_operation = xbdi->xbdi_xen_req.operation; 
1291 KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ || 
1292 xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE); 
1293 
1294 /*  
1295 * Request-level reasons not to coalesce: different device, 
1296 * different op, or noncontiguous disk sectors (vs. previous 
1297 * request handed to us). 
1298 */ 
1299 xbdi->xbdi_cont = xbdback_co_io_loop; 
1300 if (xbdi->xbdi_io != NULL) { 
1301 struct xbdback_request *last_req; 
1302 last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car; 
1303 XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64 
1304 "; got %" PRIu64 "\n", xbdi->xbdi_domid, 
1305 xbdi->xbdi_next_sector, 
1306 xbdi->xbdi_xen_req.sector_number)); 
1307 if ((xrq->rq_operation != last_req->rq_operation) 
1308 || (xbdi->xbdi_xen_req.sector_number != 
1309 xbdi->xbdi_next_sector)) { 
1310 XENPRINTF(("xbdback_io domain %d: segment break\n", 
1311 xbdi->xbdi_domid)); 
1312 xbdi->xbdi_next_sector = 
1313 xbdi->xbdi_xen_req.sector_number; 
1314 KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ || 
1315 xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE); 
1316 xbdi->xbdi_cont_aux = xbdback_co_io_loop; 
1317 xbdi->xbdi_cont = xbdback_co_map_io; 
1318 } 
1319 } else { 
1320 xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number; 
1321 } 
1322 return xbdi; 
1323} 
1324 
1325/* Handle coalescing of multiple segment requests into one I/O work */ 
1326static void * 
1327xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj) 
1328{ 
1329 (void)obj; 
1330 KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ || 
1331 xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE); 
1332 if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) { 
1333 uint8_t this_fs, this_ls, last_ls; 
1334 grant_ref_t thisgrt; 
1335 /*  
1336 * Segment-level reason to coalesce: handling full 
1337 * pages, or adjacent sector ranges from the same page 
1338 * (and yes, this latter does happen). But not if the 
1339 * array of client pseudo-physical pages is full. 
1340 */ 
1341 this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect; 
1342 this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect; 
1343 thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref; 
1344 XENPRINTF(("xbdback_io domain %d: " 
1345 "first,last_sect[%d]=0%o,0%o\n", 
1346 xbdi->xbdi_domid, xbdi->xbdi_segno, 
1347 this_fs, this_ls)); 
1348 last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls; 
1349 xbdi->xbdi_this_fs = this_fs; 
1350 xbdi->xbdi_this_ls = this_ls; 
1351 xbdi->xbdi_thisgrt = thisgrt; 
1352 if (xbdi->xbdi_io != NULL) { 
1353 if (last_ls == VBD_MAXSECT 
1354 && this_fs == 0 
1355 && xbdi->xbdi_io->xio_nrma 
1356 < XENSHM_MAX_PAGES_PER_REQUEST) { 
1357 xbdi->xbdi_same_page = 0; 
1358 } else if (last_ls + 1 
1359 == this_fs 
1360#ifdef notyet 
1361 && (last_fas & ~PAGE_MASK) 
1362 == (this_fas & ~PAGE_MASK) 
1363#else  
1364 && 0 /* can't know frame number yet */ 
1365#endif 
1366 ) { 
1367#ifdef DEBUG 
1368 if (ratecheck(&xbdi->xbdi_lastfragio_time, 
1369 &xbdback_fragio_intvl)) 
1370 printf("%s: domain is sending" 
1371 " excessively fragmented I/O\n", 
1372 xbdi->xbdi_name); 
1373#endif 
1374 printf("xbdback_io: would maybe glue " 
1375 "same page sec %d (%d->%d)\n", 
1376 xbdi->xbdi_segno, this_fs, this_ls); 
1377 XENPRINTF(("xbdback_io domain %d: glue same " 
1378 "page", xbdi->xbdi_domid)); 
1379 panic("notyet!"); 
1380 xbdi->xbdi_same_page = 1; 
1381 } else { 
1382 KASSERT(xbdi->xbdi_io->xio_operation == 
1383 BLKIF_OP_READ || 
1384 xbdi->xbdi_io->xio_operation == 
1385 BLKIF_OP_WRITE); 
1386 xbdi->xbdi_cont_aux = xbdback_co_io_loop; 
1387 xbdi->xbdi_cont = xbdback_co_map_io; 
1388 return xbdi; 
1389 } 
1390 } else 
1391 xbdi->xbdi_same_page = 0; 
1392 
1393 if (xbdi->xbdi_io == NULL) { 
1394 xbdi->xbdi_cont = xbdback_co_io_gotio; 
1395 return xbdback_pool_get(&xbdback_io_pool, xbdi); 
1396 } else { 
1397 xbdi->xbdi_cont = xbdback_co_io_gotio2; 
1398 } 
1399 } else { 
1400 /* done with the loop over segments; get next request */ 
1401 xbdi->xbdi_cont = xbdback_co_main_incr; 
1402 } 
1403 return xbdi; 
1404} 
1405 
1406/* Prepare an I/O buffer for a xbdback instance */ 1164/* Prepare an I/O buffer for a xbdback instance */
1407static void * 1165static void *
1408xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj) 1166xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
1409{ 1167{
1410 struct xbdback_io *xbd_io; 1168 struct xbdback_io *xbd_io;
1411 vaddr_t start_offset; /* start offset in vm area */ 
1412 int buf_flags; 1169 int buf_flags;
 1170 size_t bcount;
 1171 blkif_request_t *req;
1413 1172
1414 xbdi_get(xbdi); 1173 xbdi_get(xbdi);
1415 atomic_inc_uint(&xbdi->xbdi_pendingreqs); 1174 atomic_inc_uint(&xbdi->xbdi_pendingreqs);
1416  1175
 1176 req = &xbdi->xbdi_xen_req;
1417 xbd_io = xbdi->xbdi_io = obj; 1177 xbd_io = xbdi->xbdi_io = obj;
 1178 memset(xbd_io, 0, sizeof(*xbd_io));
1418 buf_init(&xbd_io->xio_buf); 1179 buf_init(&xbd_io->xio_buf);
1419 xbd_io->xio_xbdi = xbdi; 1180 xbd_io->xio_xbdi = xbdi;
1420 SLIST_INIT(&xbd_io->xio_rq); 1181 xbd_io->xio_operation = req->operation;
1421 xbd_io->xio_nrma = 0; 1182 xbd_io->xio_id = req->id;
1422 xbd_io->xio_mapped = 0; 
1423 xbd_io->xio_operation = xbdi->xbdi_xen_req.operation; 
1424 1183
1425 start_offset = xbdi->xbdi_this_fs * VBD_BSIZE; 1184 /* Process segments */
1426 KASSERT(start_offset < PAGE_SIZE); 1185 bcount = 0;
 1186 for (int i = 0; i < req->nr_segments; i++) {
 1187 xbd_io->xio_gref[i] = req->seg[i].gref;
 1188 bcount += (req->seg[i].last_sect - req->seg[i].first_sect + 1)
 1189 * VBD_BSIZE;
 1190 }
 1191 KASSERT(bcount <= MAXPHYS);
 1192 xbd_io->xio_nrma = req->nr_segments;
 1193
 1194 xbd_io->xio_start_offset = req->seg[0].first_sect * VBD_BSIZE;
 1195 KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
 1196 KASSERT(bcount + xbd_io->xio_start_offset < VBD_VA_SIZE);
1427 1197
1428 if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) { 1198 if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
1429 buf_flags = B_WRITE; 1199 buf_flags = B_WRITE;
1430 } else { 1200 } else {
1431 buf_flags = B_READ; 1201 buf_flags = B_READ;
1432 } 1202 }
1433 1203
1434 xbd_io->xio_buf.b_flags = buf_flags; 1204 xbd_io->xio_buf.b_flags = buf_flags;
1435 xbd_io->xio_buf.b_cflags = 0; 1205 xbd_io->xio_buf.b_cflags = 0;
1436 xbd_io->xio_buf.b_oflags = 0; 1206 xbd_io->xio_buf.b_oflags = 0;
1437 xbd_io->xio_buf.b_iodone = xbdback_iodone; 1207 xbd_io->xio_buf.b_iodone = xbdback_iodone;
1438 xbd_io->xio_buf.b_proc = NULL; 1208 xbd_io->xio_buf.b_proc = NULL;
1439 xbd_io->xio_buf.b_vp = xbdi->xbdi_vp; 1209 xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
1440 xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock; 1210 xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
1441 xbd_io->xio_buf.b_dev = xbdi->xbdi_dev; 1211 xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
1442 xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector; 1212 xbd_io->xio_buf.b_blkno = req->sector_number;
1443 xbd_io->xio_buf.b_bcount = 0; 1213 xbd_io->xio_buf.b_bcount = bcount;
1444 xbd_io->xio_buf.b_data = (void *)start_offset; 1214 xbd_io->xio_buf.b_data = NULL;
1445 xbd_io->xio_buf.b_private = xbd_io; 1215 xbd_io->xio_buf.b_private = xbd_io;
1446 1216
1447 xbdi->xbdi_cont = xbdback_co_io_gotio2; 
1448 return xbdi; 
1449} 
1450 
1451/* Manage fragments */ 
1452static void * 
1453xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj) 
1454{ 
1455 (void)obj; 
1456 if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) { 
1457 /* if this is the first segment of a new request */ 
1458 /* or if it's the first segment of the io */ 
1459 xbdi->xbdi_cont = xbdback_co_io_gotfrag; 
1460 return xbdback_pool_get(&xbdback_fragment_pool, xbdi); 
1461 } 
1462 xbdi->xbdi_cont = xbdback_co_io_gotfrag2; 
1463 return xbdi; 
1464} 
1465 
1466/* Prepare the instance for its first fragment */ 
1467static void * 
1468xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj) 
1469{ 
1470 struct xbdback_fragment *xbd_fr; 
1471 
1472 xbd_fr = obj; 
1473 xbd_fr->car = xbdi->xbdi_req; 
1474 SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr); 
1475 ++xbdi->xbdi_req->rq_iocount; 
1476 
1477 xbdi->xbdi_cont = xbdback_co_io_gotfrag2; 
1478 return xbdi; 
1479} 
1480 
1481/* Last routine to manage segments fragments for one I/O */ 
1482static void * 
1483xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj) 
1484{ 
1485 struct xbdback_io *xbd_io; 
1486 int seg_size; 
1487 uint8_t this_fs, this_ls; 
1488 
1489 this_fs = xbdi->xbdi_this_fs; 
1490 this_ls = xbdi->xbdi_this_ls; 
1491 xbd_io = xbdi->xbdi_io; 
1492 seg_size = this_ls - this_fs + 1; 
1493 
1494 if (seg_size < 0) { 
1495 if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) { 
1496 printf("xbdback_io domain %d: negative-size request " 
1497 "(%d %d)\n", 
1498 xbdi->xbdi_domid, this_ls, this_fs); 
1499 } 
1500 xbdback_io_error(xbdi->xbdi_io, EINVAL); 
1501 xbdi->xbdi_io = NULL; 
1502 xbdi->xbdi_cont = xbdback_co_main_incr; 
1503 return xbdi; 
1504 } 
1505  
1506 if (!xbdi->xbdi_same_page) { 
1507 XENPRINTF(("xbdback_io domain %d: appending grant %u\n", 
1508 xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt)); 
1509 xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt; 
1510 } 
1511 
1512 xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE); 
1513 XENPRINTF(("xbdback_io domain %d: start sect %ld size %d\n", 
1514 xbdi->xbdi_domid, (long)xbdi->xbdi_next_sector, seg_size)); 
1515  
1516 /* Finally, the end of the segment loop! */ 
1517 xbdi->xbdi_next_sector += seg_size; 
1518 ++xbdi->xbdi_segno; 
1519 xbdi->xbdi_cont = xbdback_co_io_loop; 
1520 return xbdi; 
1521} 
1522 
1523/* 
1524 * Map the different I/O requests in backend's VA space. 
1525 */ 
1526static void * 
1527xbdback_co_map_io(struct xbdback_instance *xbdi, void *obj) 
1528{ 
1529 (void)obj; 
1530 XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n", 
1531 xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno, 
1532 (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io)); 
1533 xbdi->xbdi_cont = xbdback_co_do_io; 1217 xbdi->xbdi_cont = xbdback_co_do_io;
1534 return xbdback_map_shm(xbdi->xbdi_io); 1218 return xbdback_map_shm(xbdi->xbdi_io);
1535} 1219}
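
A visible simplification in this revision: one ring request now maps onto exactly one struct xbdback_io, so b_bcount is just the sum of the segment lengths and b_blkno comes straight from the request's sector_number. A worked example of the arithmetic in xbdback_co_io_gotio, with made-up segment values (VBD_BSIZE being the 512-byte Xen sector, 8 sectors per 4 KB page):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VBD_BSIZE  512
#define PAGE_SIZE  4096

struct seg { uint8_t first_sect, last_sect; };

int
main(void)
{
        /* two segments: six sectors of one page, then a whole page */
        struct seg segs[] = { { 2, 7 }, { 0, 7 } };
        size_t bcount = 0;

        for (size_t i = 0; i < sizeof(segs) / sizeof(segs[0]); i++)
                bcount += (segs[i].last_sect - segs[i].first_sect + 1)
                    * VBD_BSIZE;

        /* the I/O starts partway into the first mapped page */
        size_t start_offset = segs[0].first_sect * VBD_BSIZE;
        assert(start_offset < PAGE_SIZE);

        /* (6 + 8) * 512 = 7168 bytes, starting 1024 bytes into the VA */
        printf("bcount=%zu start_offset=%zu\n", bcount, start_offset);
        return 0;
}
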
1536 1220
1537static void 1221static void
1538xbdback_io_error(struct xbdback_io *xbd_io, int error) 1222xbdback_io_error(struct xbdback_io *xbd_io, int error)
1539{ 1223{
1540 xbd_io->xio_buf.b_error = error; 1224 xbd_io->xio_buf.b_error = error;
1541 xbdback_iodone(&xbd_io->xio_buf); 1225 xbdback_iodone(&xbd_io->xio_buf);
1542} 1226}
1543 1227
1544/* 1228/*
1545 * Main xbdback I/O routine. It can either perform a flush operation or 1229 * Main xbdback I/O routine. It can either perform a flush operation or
1546 * schedule a read/write operation. 1230 * schedule a read/write operation.
1547 */ 1231 */
1548static void * 1232static void *
1549xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj) 1233xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
1550{ 1234{
1551 struct xbdback_io *xbd_io = xbdi->xbdi_io; 1235 struct xbdback_io *xbd_io = xbdi->xbdi_io;
1552 vaddr_t start_offset; 
1553 int nsegs __diagused; 1236 int nsegs __diagused;
1554 1237
1555 switch (xbd_io->xio_operation) { 1238 switch (xbd_io->xio_operation) {
1556 case BLKIF_OP_FLUSH_DISKCACHE: 1239 case BLKIF_OP_FLUSH_DISKCACHE:
1557 { 1240 {
1558 int error; 1241 int error;
1559 int force = 1; 1242 int force = 1;
1560 1243
1561 error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE, 1244 error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
1562 kauth_cred_get()); 1245 kauth_cred_get());
1563 if (error) { 1246 if (error) {
1564 aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n", 1247 aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
1565 xbdi->xbdi_xbusd->xbusd_path, error); 1248 xbdi->xbdi_xbusd->xbusd_path, error);
1566 if (error == EOPNOTSUPP || error == ENOTTY) 1249 if (error == EOPNOTSUPP || error == ENOTTY)
1567 error = BLKIF_RSP_EOPNOTSUPP; 1250 error = BLKIF_RSP_EOPNOTSUPP;
1568 else 1251 else
1569 error = BLKIF_RSP_ERROR; 1252 error = BLKIF_RSP_ERROR;
1570 } else 1253 } else
1571 error = BLKIF_RSP_OKAY; 1254 error = BLKIF_RSP_OKAY;
1572 xbdback_send_reply(xbdi, xbd_io->xio_flush_id, 1255 xbdback_send_reply(xbdi, xbd_io->xio_id,
1573 xbd_io->xio_operation, error); 1256 xbd_io->xio_operation, error);
1574 xbdback_pool_put(&xbdback_io_pool, xbd_io); 1257 xbdback_pool_put(&xbdback_io_pool, xbd_io);
1575 xbdi_put(xbdi); 1258 xbdi_put(xbdi);
1576 xbdi->xbdi_io = NULL; 1259 xbdi->xbdi_io = NULL;
1577 xbdi->xbdi_cont = xbdback_co_main_incr; 1260 xbdi->xbdi_cont = xbdback_co_main_incr;
1578 return xbdi; 1261 return xbdi;
1579 } 1262 }
1580 case BLKIF_OP_READ: 1263 case BLKIF_OP_READ:
1581 case BLKIF_OP_WRITE: 1264 case BLKIF_OP_WRITE:
1582 start_offset = (vaddr_t)xbd_io->xio_buf.b_data; 
1583 KASSERT(xbd_io->xio_buf.b_bcount + start_offset < VBD_VA_SIZE); 
1584 xbd_io->xio_buf.b_data = (void *) 1265 xbd_io->xio_buf.b_data = (void *)
1585 (start_offset + xbd_io->xio_vaddr); 1266 (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
1586#ifdef DIAGNOSTIC 1267
1587 nsegs = round_page(start_offset + xbd_io->xio_buf.b_bcount) 
1588 >> PAGE_SHIFT; 
1589 if (nsegs > xbd_io->xio_nrma) { 
1590 printf("xbdback_co_do_io: vaddr %#" PRIxVADDR 
1591 " bcount %#x doesn't fit in %d pages\n", 
1592 start_offset, xbd_io->xio_buf.b_bcount, 
1593 xbd_io->xio_nrma); 
1594 panic("xbdback_co_do_io: not enough pages"); 
1595 } 
1596#endif 
1597 if ((xbd_io->xio_buf.b_flags & B_READ) == 0) { 1268 if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
1598 mutex_enter(xbd_io->xio_buf.b_vp->v_interlock); 1269 mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
1599 xbd_io->xio_buf.b_vp->v_numoutput++; 1270 xbd_io->xio_buf.b_vp->v_numoutput++;
1600 mutex_exit(xbd_io->xio_buf.b_vp->v_interlock); 1271 mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
1601 } 1272 }
1602 bdev_strategy(&xbd_io->xio_buf); 
1603 /* will call xbdback_iodone() asynchronously when done */ 1273 /* will call xbdback_iodone() asynchronously when done */
 1274 bdev_strategy(&xbd_io->xio_buf);
1604 xbdi->xbdi_io = NULL; 1275 xbdi->xbdi_io = NULL;
1605 xbdi->xbdi_cont = xbdi->xbdi_cont_aux; 1276 xbdi->xbdi_cont = xbdback_co_main_incr;
1606 return xbdi; 1277 return xbdi;
1607 default: 1278 default:
1608 /* Should never happen */ 1279 /* Should never happen */
1609 panic("xbdback_co_do_io: unsupported operation %d", 1280 panic("xbdback_co_do_io: unsupported operation %d",
1610 xbd_io->xio_operation); 1281 xbd_io->xio_operation);
1611 } 1282 }
1612} 1283}
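
The flush branch above distinguishes "unsupported" from "failed", so a frontend can stop issuing BLKIF_OP_FLUSH_DISKCACHE to a backend device that has no cache-sync ioctl. The errno-to-status mapping shown in isolation (a sketch; the helper name is invented):

#include <errno.h>
#include <stdio.h>

#define BLKIF_RSP_EOPNOTSUPP    (-2)
#define BLKIF_RSP_ERROR         (-1)
#define BLKIF_RSP_OKAY          0

static int
flush_status(int error)
{
        if (error == 0)
                return BLKIF_RSP_OKAY;
        /* device has no cache-sync ioctl: tell the guest, don't just fail */
        if (error == EOPNOTSUPP || error == ENOTTY)
                return BLKIF_RSP_EOPNOTSUPP;
        return BLKIF_RSP_ERROR;
}

int
main(void)
{
        printf("%d %d %d\n", flush_status(0), flush_status(ENOTTY),
            flush_status(EIO));
        return 0;
}
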
1613 1284
1614/* 1285/*
1615 * Called from softint(9) context when an I/O is done: for each request, send 1286 * Called from softint(9) context when an I/O is done: for each request, send
1616 * back the associated reply to the domain. 1287 * back the associated reply to the domain.
1617 * 1288 *
1618 * This gets reused by xbdback_io_error to report errors from other sources. 1289 * This gets reused by xbdback_io_error to report errors from other sources.
1619 */ 1290 */
1620static void 1291static void
1621xbdback_iodone(struct buf *bp) 1292xbdback_iodone(struct buf *bp)
1622{ 1293{
1623 struct xbdback_io *xbd_io; 1294 struct xbdback_io *xbd_io;
1624 struct xbdback_instance *xbdi; 1295 struct xbdback_instance *xbdi;
1625 int errp; 1296 int status;
1626 1297
1627 KERNEL_LOCK(1, NULL); /* XXXSMP */ 1298 KERNEL_LOCK(1, NULL); /* XXXSMP */
1628 1299
1629 xbd_io = bp->b_private; 1300 xbd_io = bp->b_private;
1630 xbdi = xbd_io->xio_xbdi; 1301 xbdi = xbd_io->xio_xbdi;
1631 1302
1632 XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n", 1303 XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
1633 xbdi->xbdi_domid, (long)xbd_io)); 1304 xbdi->xbdi_domid, (long)xbd_io));
1634 1305
1635 if (xbd_io->xio_mapped == 1) 1306 KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
 1307 if (xbd_io->xio_xv != NULL)
1636 xbdback_unmap_shm(xbd_io); 1308 xbdback_unmap_shm(xbd_io);
1637 1309
1638 if (bp->b_error != 0) { 1310 if (bp->b_error != 0) {
1639 printf("xbd IO domain %d: error %d\n", 1311 printf("xbd IO domain %d: error %d\n",
1640 xbdi->xbdi_domid, bp->b_error); 1312 xbdi->xbdi_domid, bp->b_error);
1641 errp = 1; 1313 status = BLKIF_RSP_ERROR;
1642 } else 1314 } else
1643 errp = 0; 1315 status = BLKIF_RSP_OKAY;
1644  1316
1645 /* for each constituent xbd request */ 1317 xbdback_send_reply(xbdi, xbd_io->xio_id, xbd_io->xio_operation, status);
1646 while(!SLIST_EMPTY(&xbd_io->xio_rq)) { 1318
1647 struct xbdback_fragment *xbd_fr; 
1648 struct xbdback_request *xbd_req; 
1649 struct xbdback_instance *rxbdi __diagused; 
1650 int error; 
1651  
1652 xbd_fr = SLIST_FIRST(&xbd_io->xio_rq); 
1653 xbd_req = xbd_fr->car; 
1654 SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr); 
1655 xbdback_pool_put(&xbdback_fragment_pool, xbd_fr); 
1656  
1657 if (errp) 
1658 ++xbd_req->rq_ioerrs; 
1659  
1660 /* finalize it only if this was its last I/O */ 
1661 if (--xbd_req->rq_iocount > 0) 
1662 continue; 
1663 
1664 rxbdi = xbd_req->rq_xbdi; 
1665 KASSERT(xbdi == rxbdi); 
1666  
1667 error = xbd_req->rq_ioerrs > 0 
1668 ? BLKIF_RSP_ERROR 
1669 : BLKIF_RSP_OKAY; 
1670 
1671 XENPRINTF(("xbdback_io domain %d: end request %"PRIu64 
1672 " error=%d\n", 
1673 xbdi->xbdi_domid, xbd_req->rq_id, error)); 
1674 xbdback_send_reply(xbdi, xbd_req->rq_id, 
1675 xbd_req->rq_operation, error); 
1676 xbdback_pool_put(&xbdback_request_pool, xbd_req); 
1677 } 
1678 xbdi_put(xbdi); 1319 xbdi_put(xbdi);
1679 atomic_dec_uint(&xbdi->xbdi_pendingreqs); 1320 atomic_dec_uint(&xbdi->xbdi_pendingreqs);
1680 buf_destroy(&xbd_io->xio_buf); 1321 buf_destroy(&xbd_io->xio_buf);
1681 xbdback_pool_put(&xbdback_io_pool, xbd_io); 1322 xbdback_pool_put(&xbdback_io_pool, xbd_io);
1682 1323
1683 xbdback_wakeup_thread(xbdi); 1324 xbdback_wakeup_thread(xbdi);
1684 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ 1325 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1685} 1326}
1686 1327
1687/* 1328/*
1688 * Wake up the per xbdback instance thread. 1329 * Wake up the per xbdback instance thread.
1689 */ 1330 */
1690static void 1331static void
1691xbdback_wakeup_thread(struct xbdback_instance *xbdi) 1332xbdback_wakeup_thread(struct xbdback_instance *xbdi)
1692{ 1333{
1693 1334
1694 mutex_enter(&xbdi->xbdi_lock); 1335 mutex_enter(&xbdi->xbdi_lock);
1695 /* only set RUN state when we are WAITING for work */ 1336 /* only set RUN state when we are WAITING for work */
1696 if (xbdi->xbdi_status == WAITING) 1337 if (xbdi->xbdi_status == WAITING)
1697 xbdi->xbdi_status = RUN; 1338 xbdi->xbdi_status = RUN;
1698 cv_broadcast(&xbdi->xbdi_cv); 1339 cv_broadcast(&xbdi->xbdi_cv);
1699 mutex_exit(&xbdi->xbdi_lock); 1340 mutex_exit(&xbdi->xbdi_lock);
1700} 1341}
1701 1342
1702/* 1343/*
 1703 * Called once a request has completed. Place the reply in the ring and 1344 * Called once a request has completed. Place the reply in the ring and
1704 * notify the guest OS. 1345 * notify the guest OS.
1705 */ 1346 */
1706static void 1347static void
1707xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id, 1348xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
1708 int op, int status) 1349 int op, int status)
1709{ 1350{
1710 blkif_response_t *resp_n; 1351 blkif_response_t *resp_n;
1711 blkif_x86_32_response_t *resp32; 1352 blkif_x86_32_response_t *resp32;
1712 blkif_x86_64_response_t *resp64; 1353 blkif_x86_64_response_t *resp64;
1713 int notify; 1354 int notify;
1714 1355
1715 /* 1356 /*
1716 * The ring can be accessed by the xbdback thread, xbdback_iodone() 1357 * The ring can be accessed by the xbdback thread, xbdback_iodone()
1717 * handler, or any handler that triggered the shm callback. So 1358 * handler, or any handler that triggered the shm callback. So
1718 * protect ring access via the xbdi_lock mutex. 1359 * protect ring access via the xbdi_lock mutex.
1719 */ 1360 */
1720 mutex_enter(&xbdi->xbdi_lock); 1361 mutex_enter(&xbdi->xbdi_lock);
1721 switch (xbdi->xbdi_proto) { 1362 switch (xbdi->xbdi_proto) {
1722 case XBDIP_NATIVE: 1363 case XBDIP_NATIVE:
1723 resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n, 1364 resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
1724 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1365 xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1725 resp_n->id = id; 1366 resp_n->id = id;
1726 resp_n->operation = op; 1367 resp_n->operation = op;
1727 resp_n->status = status; 1368 resp_n->status = status;
1728 break; 1369 break;
1729 case XBDIP_32: 1370 case XBDIP_32:
1730 resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32, 1371 resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
1731 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1372 xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1732 resp32->id = id; 1373 resp32->id = id;
1733 resp32->operation = op; 1374 resp32->operation = op;
1734 resp32->status = status; 1375 resp32->status = status;
1735 break; 1376 break;
1736 case XBDIP_64: 1377 case XBDIP_64:
1737 resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64, 1378 resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
1738 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1379 xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1739 resp64->id = id; 1380 resp64->id = id;
1740 resp64->operation = op; 1381 resp64->operation = op;
1741 resp64->status = status; 1382 resp64->status = status;
1742 break; 1383 break;
1743 } 1384 }
1744 xbdi->xbdi_ring.ring_n.rsp_prod_pvt++; 1385 xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
1745 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify); 1386 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
1746 mutex_exit(&xbdi->xbdi_lock); 1387 mutex_exit(&xbdi->xbdi_lock);
1747 1388
1748 if (notify) { 1389 if (notify) {
1749 XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid)); 1390 XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
1750 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn); 1391 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
1751 } 1392 }
1752} 1393}
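
xbdback_send_reply is the producer half of the ring protocol: fill in the response, publish rsp_prod, then notify only if the frontend armed an event inside the range just pushed. A single-producer sketch of the logic behind RING_PUSH_RESPONSES_AND_CHECK_NOTIFY (C11 atomics in place of the ring macros' barriers; the ring size is invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 32    /* must be a power of two */

struct resp { uint64_t id; int status; };

struct ring {
        struct resp slot[RING_SIZE];
        _Atomic unsigned rsp_prod;   /* index published to the frontend */
        _Atomic unsigned rsp_event;  /* frontend wants a kick at this index */
        unsigned rsp_prod_pvt;       /* producer-private next slot */
};

/* returns true if the event channel should be notified */
static bool
push_response(struct ring *r, uint64_t id, int status)
{
        unsigned old = atomic_load(&r->rsp_prod);
        unsigned prod;

        r->slot[r->rsp_prod_pvt % RING_SIZE] =
            (struct resp){ .id = id, .status = status };
        prod = ++r->rsp_prod_pvt;
        /* publish the index only after the payload is in place */
        atomic_store(&r->rsp_prod, prod);
        /* notify iff rsp_event lies within (old, prod] */
        return (unsigned)(prod - atomic_load(&r->rsp_event)) <
            (unsigned)(prod - old);
}

int
main(void)
{
        static struct ring r;

        atomic_store(&r.rsp_event, 1);  /* consumer armed an event */
        printf("notify=%d\n", push_response(&r, 7, 0));
        return 0;
}
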
1753 1394
1754/* 1395/*
1755 * Map multiple entries of an I/O request into backend's VA space. 1396 * Map multiple entries of an I/O request into backend's VA space.
1756 * The xbd_io->xio_gref array has to be filled out by the caller. 1397 * The xbd_io->xio_gref array has to be filled out by the caller.
1757 */ 1398 */
1758static void * 1399static void *
1759xbdback_map_shm(struct xbdback_io *xbd_io) 1400xbdback_map_shm(struct xbdback_io *xbd_io)
1760{ 1401{
1761 struct xbdback_instance *xbdi = xbd_io->xio_xbdi; 1402 struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
1762 struct xbdback_request *xbd_rq; 
1763 int error, s; 1403 int error, s;
1764 1404
1765#ifdef XENDEBUG_VBD 1405#ifdef XENDEBUG_VBD
1766 int i; 1406 int i;
1767 printf("xbdback_map_shm map grant "); 1407 printf("xbdback_map_shm map grant ");
1768 for (i = 0; i < xbd_io->xio_nrma; i++) { 1408 for (i = 0; i < xbd_io->xio_nrma; i++) {
1769 printf("%u ", (u_int)xbd_io->xio_gref[i]); 1409 printf("%u ", (u_int)xbd_io->xio_gref[i]);
1770 } 1410 }
1771#endif 1411#endif
1772 1412
1773 KASSERT(xbd_io->xio_mapped == 0); 
1774 
1775 s = splvm(); /* XXXSMP */ 1413 s = splvm(); /* XXXSMP */
1776 xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car; 
1777 
1778 xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free); 1414 xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
1779 KASSERT(xbd_io->xio_xv != NULL); 1415 KASSERT(xbd_io->xio_xv != NULL);
1780 SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next); 1416 SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
1781 xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr; 1417 xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr;
1782 splx(s); 1418 splx(s);
1783 1419
1784 error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid, 1420 error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
1785 xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,  1421 xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,
1786 (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0); 1422 (xbd_io->xio_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
1787 1423
1788 switch(error) { 1424 switch(error) {
1789 case 0: 1425 case 0:
1790#ifdef XENDEBUG_VBD 1426#ifdef XENDEBUG_VBD
1791 printf("handle "); 1427 printf("handle ");
1792 for (i = 0; i < xbd_io->xio_nrma; i++) { 1428 for (i = 0; i < xbd_io->xio_nrma; i++) {
1793 printf("%u ", (u_int)xbd_io->xio_gh[i]); 1429 printf("%u ", (u_int)xbd_io->xio_gh[i]);
1794 } 1430 }
1795 printf("\n"); 1431 printf("\n");
1796#endif 1432#endif
1797 xbd_io->xio_mapped = 1; 
1798 return xbdi; 1433 return xbdi;
1799 default: 1434 default:
1800 if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) { 1435 if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
1801 printf("xbdback_map_shm: xen_shm error %d ", error); 1436 printf("xbdback_map_shm: xen_shm error %d ", error);
1802 } 1437 }
1803 xbdback_io_error(xbdi->xbdi_io, error); 1438 xbdback_io_error(xbdi->xbdi_io, error);
1804 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next); 1439 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
1805 xbd_io->xio_xv = NULL; 1440 xbd_io->xio_xv = NULL;
1806 xbdi->xbdi_io = NULL; 1441 xbdi->xbdi_io = NULL;
1807 // do not retry 1442 // do not retry
1808 xbdi->xbdi_cont = xbdback_co_main_incr; 1443 xbdi->xbdi_cont = xbdback_co_main_incr;
1809 return xbdi; 1444 return xbdi;
1810 } 1445 }
1811} 1446}
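
Grant mapping never allocates kernel VA in the I/O path: xbdback_map_shm pops a preallocated window off the per-instance xbdi_va_free list, and xbdback_unmap_shm pushes it back. A userspace sketch of that slot recycling using the same <sys/queue.h> macros (slot count, window size and backing storage are invented for the example):

#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>

#define NSLOTS    4             /* invented: max in-flight I/Os */
#define SLOT_SIZE (11 * 4096)   /* invented: about one max-sized request */

struct va_slot {
        char *xv_vaddr;
        SLIST_ENTRY(va_slot) xv_next;
};

static SLIST_HEAD(, va_slot) va_free = SLIST_HEAD_INITIALIZER(va_free);
static struct va_slot slots[NSLOTS];
static char backing[NSLOTS][SLOT_SIZE];

int
main(void)
{
        for (int i = 0; i < NSLOTS; i++) {
                slots[i].xv_vaddr = backing[i];
                SLIST_INSERT_HEAD(&va_free, &slots[i], xv_next);
        }

        /* map: pop a free window; grants would be entered at xv_vaddr */
        struct va_slot *xv = SLIST_FIRST(&va_free);
        assert(xv != NULL);
        SLIST_REMOVE_HEAD(&va_free, xv_next);

        /* unmap: recycle the window for the next request */
        SLIST_INSERT_HEAD(&va_free, xv, xv_next);
        printf("recycled slot at %p\n", (void *)xv->xv_vaddr);
        return 0;
}
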
1812 1447
1813/* unmap a request from our virtual address space (request is done) */ 1448/* unmap a request from our virtual address space (request is done) */
1814static void 1449static void
1815xbdback_unmap_shm(struct xbdback_io *xbd_io) 1450xbdback_unmap_shm(struct xbdback_io *xbd_io)
1816{ 1451{
1817 struct xbdback_instance *xbdi = xbd_io->xio_xbdi; 1452 struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
1818 1453
1819#ifdef XENDEBUG_VBD 1454#ifdef XENDEBUG_VBD
1820 int i; 1455 int i;
1821 printf("xbdback_unmap_shm handle "); 1456 printf("xbdback_unmap_shm handle ");
1822 for (i = 0; i < xbd_io->xio_nrma; i++) { 1457 for (i = 0; i < xbd_io->xio_nrma; i++) {
1823 printf("%u ", (u_int)xbd_io->xio_gh[i]); 1458 printf("%u ", (u_int)xbd_io->xio_gh[i]);
1824 } 1459 }
1825 printf("\n"); 1460 printf("\n");
1826#endif 1461#endif
1827 1462
1828 KASSERT(xbd_io->xio_mapped == 1); 1463 KASSERT(xbd_io->xio_xv != NULL);
1829 xbd_io->xio_mapped = 0; 
1830 xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma, 1464 xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
1831 xbd_io->xio_gh); 1465 xbd_io->xio_gh);
1832 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next); 1466 SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
1833 xbd_io->xio_xv = NULL; 1467 xbd_io->xio_xv = NULL;
1834 xbd_io->xio_vaddr = -1; 1468 xbd_io->xio_vaddr = -1;
1835} 1469}
1836 1470
1837/* Obtain memory from a pool */ 1471/* Obtain memory from a pool */
1838static void * 1472static void *
1839xbdback_pool_get(struct xbdback_pool *pp, 1473xbdback_pool_get(struct pool_cache *pc,
1840 struct xbdback_instance *xbdi) 1474 struct xbdback_instance *xbdi)
1841{ 1475{
1842 return pool_cache_get(&pp->pc, PR_WAITOK); 1476 return pool_cache_get(pc, PR_WAITOK);
1843} 1477}
1844 1478
1845/* Restore memory to a pool */ 1479/* Restore memory to a pool */
1846static void 1480static void
1847xbdback_pool_put(struct xbdback_pool *pp, void *item) 1481xbdback_pool_put(struct pool_cache *pc, void *item)
1848{ 1482{
1849 pool_cache_put(&pp->pc, item); 1483 pool_cache_put(pc, item);
1850} 1484}
1851 1485
1852/* 1486/*
1853 * Trampoline routine. Calls continuations in a loop and only exits when 1487 * Trampoline routine. Calls continuations in a loop and only exits when
1854 * either the returned object or the next callback is NULL. 1488 * either the returned object or the next callback is NULL.
1855 */ 1489 */
1856static void 1490static void
1857xbdback_trampoline(struct xbdback_instance *xbdi, void *obj) 1491xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
1858{ 1492{
1859 xbdback_cont_t cont; 1493 xbdback_cont_t cont;
1860 1494
1861 while(obj != NULL && xbdi->xbdi_cont != NULL) { 1495 while(obj != NULL && xbdi->xbdi_cont != NULL) {
1862 cont = xbdi->xbdi_cont; 1496 cont = xbdi->xbdi_cont;
1863#ifdef DIAGNOSTIC 1497#ifdef DIAGNOSTIC
1864 xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF; 1498 xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
1865#endif 1499#endif
1866 obj = (*cont)(xbdi, obj); 1500 obj = (*cont)(xbdi, obj);
1867#ifdef DIAGNOSTIC 1501#ifdef DIAGNOSTIC
1868 if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) { 1502 if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
1869 printf("xbdback_trampoline: 0x%lx didn't set " 1503 printf("xbdback_trampoline: 0x%lx didn't set "
1870 "xbdi->xbdi_cont!\n", (long)cont); 1504 "xbdi->xbdi_cont!\n", (long)cont);
1871 panic("xbdback_trampoline: bad continuation"); 1505 panic("xbdback_trampoline: bad continuation");
1872 } 1506 }
1873#endif 1507#endif
1874 } 1508 }
1875} 1509}
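
The trampoline is the glue for the whole continuation scheme: every xbdback_co_* routine stores the next step in xbdi_cont and returns the object to pass along, or NULL to park the pipeline until an event, iodone() or the shm callback restarts it. A self-contained sketch of the pattern with generic names:

#include <stdio.h>

struct ctx;
typedef void *(*cont_t)(struct ctx *, void *);

struct ctx {
        cont_t cont;
};

static void *step_b(struct ctx *, void *);

static void *
step_a(struct ctx *c, void *obj)
{
        printf("step_a\n");
        c->cont = step_b;       /* set the next continuation, then return */
        return obj;
}

static void *
step_b(struct ctx *c, void *obj)
{
        printf("step_b\n");
        c->cont = NULL;         /* NULL parks the machine */
        return obj;
}

static void
trampoline(struct ctx *c, void *obj)
{
        while (obj != NULL && c->cont != NULL) {
                cont_t cont = c->cont;
                obj = (*cont)(c, obj);
        }
}

int
main(void)
{
        struct ctx c = { .cont = step_a };

        trampoline(&c, &c);
        return 0;
}

The DIAGNOSTIC poisoning of xbdi_cont with 0xDEADBEEF in the real routine catches continuations that forget this contract; the sketch omits it.
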