Mon Apr 20 16:12:28 2020 UTC
remove the I/O defragmentation logic and pass requests straight to the
underlying block device without trying to coalesce them

it seems rarely useful, and it makes the handling logic unnecessarily complex -
ultimately it is the DomU operating system's responsibility to issue optimal I/O

might also help with the ZFS problem reported on port-xen, and will surely
simplify eventual indirect segment support


(jdolecek)
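
For reference, a condensed sketch of the new per-request path (not part of the
committed diff): it folds the continuation steps xbdback_co_io_gotio(),
xbdback_map_shm() and xbdback_co_do_io() from the diff below into a single
hypothetical function, with error handling and the continuation machinery
omitted; all identifiers are taken from the diff.

/*
 * Illustration only: one Xen ring request becomes exactly one struct buf
 * covering all of its segments, instead of being coalesced with
 * neighbouring requests.
 */
static void
sketch_one_request(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io)
{
	blkif_request_t *req = &xbdi->xbdi_xen_req;
	size_t bcount = 0;

	/* one grant per segment, byte count summed over all segments */
	for (int i = 0; i < req->nr_segments; i++) {
		xbd_io->xio_gref[i] = req->seg[i].gref;
		bcount += (req->seg[i].last_sect - req->seg[i].first_sect + 1)
		    * VBD_BSIZE;
	}
	xbd_io->xio_nrma = req->nr_segments;
	xbd_io->xio_start_offset = req->seg[0].first_sect * VBD_BSIZE;

	/* a single struct buf describes the whole Xen request */
	xbd_io->xio_buf.b_blkno = req->sector_number;
	xbd_io->xio_buf.b_bcount = bcount;

	/* map the guest pages, then hand the buf straight to the device */
	(void)xbdback_map_shm(xbd_io);	/* error handling omitted */
	xbd_io->xio_buf.b_data = (void *)
	    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
	bdev_strategy(&xbd_io->xio_buf);
}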
diff -r1.82 -r1.83 src/sys/arch/xen/xen/xbdback_xenbus.c

cvs diff -r1.82 -r1.83 src/sys/arch/xen/xen/xbdback_xenbus.c
--- src/sys/arch/xen/xen/xbdback_xenbus.c 2020/04/20 14:11:04 1.82
+++ src/sys/arch/xen/xen/xbdback_xenbus.c 2020/04/20 16:12:28 1.83
@@ -1,4 +1,4 @@
-/*      $NetBSD: xbdback_xenbus.c,v 1.82 2020/04/20 14:11:04 jdolecek Exp $      */
+/*      $NetBSD: xbdback_xenbus.c,v 1.83 2020/04/20 16:12:28 jdolecek Exp $      */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.82 2020/04/20 14:11:04 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.83 2020/04/20 16:12:28 jdolecek Exp $");
 
 #include <sys/atomic.h>
 #include <sys/buf.h>
@@ -74,9 +74,7 @@
 /* Need to alloc one extra page to account for possible mapping offset */
 #define VBD_VA_SIZE	(MAXPHYS + PAGE_SIZE)
 
-struct xbdback_request;
 struct xbdback_io;
-struct xbdback_fragment;
 struct xbdback_instance;
 
 /*
@@ -98,8 +96,8 @@
  * callback), the return value of a continuation can be set to NULL. In that
  * case, the thread will go back to sleeping and wait for the proper
  * condition before it starts processing requests again from where it left.
- * Continuation state is "stored" in the xbdback instance (xbdi_cont and
- * xbdi_cont_aux), and should only be manipulated by the instance thread.
+ * Continuation state is "stored" in the xbdback instance (xbdi_cont),
+ * and should only be manipulated by the instance thread.
  *
  * As xbdback(4) has to handle different sort of asynchronous events (Xen
  * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
@@ -109,37 +107,24 @@
  * Here's how the call graph is supposed to be for a single I/O:
  *
  * xbdback_co_main()
- *        |
- *        |               --> xbdback_co_cache_doflush() or NULL
- *        |               |
- *        |               - xbdback_co_cache_flush2() <- xbdback_co_do_io() <-
- *        |                                            |                     |
- *        |               |-> xbdback_co_cache_flush() -> xbdback_co_map_io()-
+ *        |               --> xbdback_co_cache_flush()
+ *        |               |    |
+ *        |               |    -> xbdback_co_cache_doflush() or NULL
+ *        |               |        |
+ *        |               |        -> xbdback_co_do_io()
  * xbdback_co_main_loop()-|
- *        |               |-> xbdback_co_main_done() ---> xbdback_co_map_io()-
- *        |                                           |                      |
- *        |               -- xbdback_co_main_done2() <-- xbdback_co_do_io() <-
+ *        |               |-> xbdback_co_main_done2() or NULL
  *        |               |
- *        |               --> xbdback_co_main() or NULL
+ *        |               --> xbdback_co_main_incr() -> xbdback_co_main_loop()
  *        |
  *     xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
  *        |
- *     xbdback_co_io_gotreq()--+--> xbdback_co_map_io() ---
- *        |                    |                          |
- *  -> xbdback_co_io_loop()----|  <- xbdback_co_do_io() <--
- *  |     |     |     |
- *  |     |     |     |----------> xbdback_co_io_gotio()
- *  |     |     |                         |
- *  |     |   xbdback_co_main_incr()      |
- *  |     |     |                         |
- *  |     |   xbdback_co_main_loop()      |
- *  |     |                               |
- *  |  xbdback_co_io_gotio2() <-----------|
- *  |     |           |
- *  |     |           |----------> xbdback_co_io_gotfrag()
- *  |     |                               |
- *  -- xbdback_co_io_gotfrag2() <---------|
+ *     xbdback_co_io_gotio() -> xbdback_map_shm()
+ *        |                     |
+ *        |                     xbdback_co_main_incr() -> xbdback_co_main_loop()
  *        |
+ *     xbdback_co_do_io() 
+ *        |
  *     xbdback_co_main_incr() -> xbdback_co_main_loop()
  */
 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
@@ -192,17 +177,12 @@
 	 * continuation-ness (see above).
 	 */
 	RING_IDX xbdi_req_prod; /* limit on request indices */
-	xbdback_cont_t xbdi_cont, xbdi_cont_aux;
+	xbdback_cont_t xbdi_cont;
 	/* _request state: track requests fetched from ring */
 	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
 	blkif_request_t xbdi_xen_req;
-	int xbdi_segno;
 	/* _io state: I/O associated to this instance */
-	struct xbdback_io *xbdi_io; /* if NULL, ignore next field */
-	daddr_t xbdi_next_sector;
-	uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */
-	uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */
-	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
+	struct xbdback_io *xbdi_io;
 	/* other state */
 	int xbdi_same_page; /* are we merging two segments on the same page? */
 	uint xbdi_pendingreqs; /* number of I/O in fly */
@@ -223,20 +203,6 @@
 static kmutex_t xbdback_lock;
 
 /*
- * For each request from a guest, a xbdback_request is allocated from
- * a pool.  This will describe the request until completion.  The
- * request may require multiple IO operations to perform, so the
- * per-IO information is not stored here.
- */
-struct xbdback_request {
-	struct xbdback_instance *rq_xbdi; /* our xbd instance */
-	uint64_t rq_id;
-	int rq_iocount; /* reference count; or, number of outstanding I/O's */
-	int rq_ioerrs;
-	uint8_t rq_operation;
-};
-
-/*
  * For each I/O operation associated with one of those requests, an
  * xbdback_io is allocated from a pool.  It may correspond to multiple
  * Xen disk requests, or parts of them, if several arrive at once that
@@ -246,56 +212,35 @@
 	/* The instance pointer is duplicated for convenience. */
 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
 	uint8_t xio_operation;
+	uint64_t xio_id;
 	union {
 		struct {
 			struct buf xio_buf; /* our I/O */
-			/* xbd requests involved */
-			SLIST_HEAD(, xbdback_fragment) xio_rq;
 			/* the virtual address to map the request at */
 			vaddr_t xio_vaddr;
 			struct xbdback_va *xio_xv;
+			vaddr_t xio_start_offset;	/* I/O start offset */
 			/* grants to map */
 			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
 			/* grants release */
 			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
 			uint16_t xio_nrma; /* number of guest pages */
-			uint16_t xio_mapped; /* == 1: grants are mapped */
 		} xio_rw;
-		uint64_t xio_flush_id;
 	} u;
 };
 #define xio_buf		u.xio_rw.xio_buf
-#define xio_rq		u.xio_rw.xio_rq
 #define xio_vaddr	u.xio_rw.xio_vaddr
+#define xio_start_offset	u.xio_rw.xio_start_offset
 #define xio_xv		u.xio_rw.xio_xv
 #define xio_gref	u.xio_rw.xio_gref
 #define xio_gh		u.xio_rw.xio_gh
 #define xio_nrma	u.xio_rw.xio_nrma
-#define xio_mapped	u.xio_rw.xio_mapped
 
-#define xio_flush_id	u.xio_flush_id
-
 /*
- * Rather than having the xbdback_io keep an array of the
- * xbdback_requests involved, since the actual number will probably be
- * small but might be as large as BLKIF_RING_SIZE, use a list.  This
- * would be threaded through xbdback_request, but one of them might be
- * part of multiple I/O's, alas.
- */
-struct xbdback_fragment {
-	struct xbdback_request *car;
-	SLIST_ENTRY(xbdback_fragment) cdr;
-};
-
-/*
  * Pools to manage the chain of block requests and I/Os fragments
  * submitted by frontend.
  */
-/* XXXSMP */
-static struct xbdback_pool {
-	struct pool_cache pc;
-	struct timeval last_warning;
-} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;
+static struct pool_cache xbdback_io_pool;
 
 /* Interval between reports of I/O errors from frontend */
 static const struct timeval xbdback_err_intvl = { 1, 0 };
@@ -320,22 +265,14 @@
 static void *xbdback_co_main(struct xbdback_instance *, void *);
 static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
 static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
-static void *xbdback_co_main_done(struct xbdback_instance *, void *);
 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
 
 static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
-static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *);
 static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
 
 static void *xbdback_co_io(struct xbdback_instance *, void *);
-static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
-static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
-static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
-static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
-static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);
 
-static void *xbdback_co_map_io(struct xbdback_instance *, void *);
 static void *xbdback_co_do_io(struct xbdback_instance *, void *);
 
 static void xbdback_io_error(struct xbdback_io *, int);
@@ -345,9 +282,9 @@
 static void *xbdback_map_shm(struct xbdback_io *);
 static void xbdback_unmap_shm(struct xbdback_io *);
 
-static void *xbdback_pool_get(struct xbdback_pool *,
+static void *xbdback_pool_get(struct pool_cache *,
 			      struct xbdback_instance *);
-static void xbdback_pool_put(struct xbdback_pool *, void *);
+static void xbdback_pool_put(struct pool_cache *, void *);
 static void xbdback_thread(void *);
 static void xbdback_wakeup_thread(struct xbdback_instance *);
 static void xbdback_trampoline(struct xbdback_instance *, void *);
@@ -369,21 +306,12 @@
 	SLIST_INIT(&xbdback_instances);
 	mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE);
 
-	pool_cache_bootstrap(&xbdback_request_pool.pc,
-	    sizeof(struct xbdback_request), 0, 0, 0, "xbbrp", NULL,
-	    IPL_SOFTBIO, NULL, NULL, NULL);
-	pool_cache_bootstrap(&xbdback_io_pool.pc,
+	pool_cache_bootstrap(&xbdback_io_pool,
 	    sizeof(struct xbdback_io), 0, 0, 0, "xbbip", NULL,
 	    IPL_SOFTBIO, NULL, NULL, NULL);
-	pool_cache_bootstrap(&xbdback_fragment_pool.pc,
-	    sizeof(struct xbdback_fragment), 0, 0, 0, "xbbfp", NULL,
-	    IPL_SOFTBIO, NULL, NULL, NULL);
 
 	/* we allocate enough to handle a whole ring at once */
-	pool_prime(&xbdback_request_pool.pc.pc_pool, BLKIF_RING_SIZE);
-	pool_prime(&xbdback_io_pool.pc.pc_pool, BLKIF_RING_SIZE);
-	pool_prime(&xbdback_fragment_pool.pc.pc_pool,
-            BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE);
+	pool_prime(&xbdback_io_pool.pc_pool, BLKIF_RING_SIZE);
 
 	xenbus_backend_register(&xbd_backend_driver);
 }
@@ -1074,7 +1002,8 @@
 			break;
 		}
 	} else {
-		xbdi->xbdi_cont = xbdback_co_main_done;
+		KASSERT(xbdi->xbdi_io == NULL);
+		xbdi->xbdi_cont = xbdback_co_main_done2;
 	}
 	return xbdi;
 }
@@ -1112,25 +1041,6 @@
 }
 
 /*
- * Ring processing is over. If there are any I/O still present for this
- * instance, handle them first.
- */
-static void *
-xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
-{
-	(void)obj;
-	if (xbdi->xbdi_io != NULL) {
-		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
-		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
-		xbdi->xbdi_cont = xbdback_co_map_io;
-		xbdi->xbdi_cont_aux = xbdback_co_main_done2;
-	} else {
-		xbdi->xbdi_cont = xbdback_co_main_done2;
-	}
-	return xbdi;
-}
-
-/*
  * Check for requests in the instance's ring. In case there are, start again
  * from the beginning. If not, stall.
  */
@@ -1139,6 +1049,7 @@
 {
 	int work_to_do;
 
+	KASSERT(xbdi->xbdi_io == NULL);
 	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
 	if (work_to_do)
 		xbdi->xbdi_cont = xbdback_co_main;
@@ -1152,29 +1063,8 @@
  * Frontend requested a cache flush operation.
  */
 static void *
-xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
+xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj __unused)
 {
-	(void)obj;
-
-	XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj));
-	if (xbdi->xbdi_io != NULL) {
-		/* Some I/Os are required for this instance. Process them. */
-		KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
-		    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
-		KASSERT(xbdi->xbdi_pendingreqs > 0);
-		xbdi->xbdi_cont = xbdback_co_map_io;
-		xbdi->xbdi_cont_aux = xbdback_co_cache_flush2;
-	} else {
-		xbdi->xbdi_cont = xbdback_co_cache_flush2;
-	}
-	return xbdi;
-}
-
-static void *
-xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj)
-{
-	(void)obj;
-	XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj));
 	if (xbdi->xbdi_pendingreqs > 0) {
 		/*
 		 * There are pending requests.
@@ -1198,7 +1088,7 @@
 	xbd_io = xbdi->xbdi_io = obj;
 	xbd_io->xio_xbdi = xbdi;
 	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
-	xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id;
+	xbd_io->xio_id = xbdi->xbdi_xen_req.id;
 	xbdi->xbdi_cont = xbdback_co_do_io;
 	return xbdi;
 }
@@ -1240,8 +1130,6 @@
 		}
 	}
 
-	xbdi->xbdi_segno = 0;
-
 	/* copy request segments */
 	switch(xbdi->xbdi_proto) {
 	case XBDIP_NATIVE:
@@ -1261,170 +1149,52 @@
 		break;
 	}
 
-	xbdi->xbdi_cont = xbdback_co_io_gotreq;
-	return xbdback_pool_get(&xbdback_request_pool, xbdi);
+	KASSERT(xbdi->xbdi_io == NULL);
+	xbdi->xbdi_cont = xbdback_co_io_gotio;
+	return xbdback_pool_get(&xbdback_io_pool, xbdi);
 
  end:
 	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
-	    xbdi->xbdi_xen_req.operation, error);
+	    xbdi->xbdi_xen_req.operation,
+	    (error == EROFS) ? BLKIF_RSP_EOPNOTSUPP : BLKIF_RSP_ERROR);
 	xbdi->xbdi_cont = xbdback_co_main_incr;
 	return xbdi;
 }
 
-/*
- * We have fetched segment requests from the ring. In case there are already
- * I/Os prepared for this instance, we can try coalescing the requests
- * with these I/Os.
- */
-static void *
-xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
-{
-	struct xbdback_request *xrq;
-
-	xrq = xbdi->xbdi_req = obj;
-	
-	xrq->rq_xbdi = xbdi;
-	xrq->rq_iocount = 0;
-	xrq->rq_ioerrs = 0;
-	xrq->rq_id = xbdi->xbdi_xen_req.id;
-	xrq->rq_operation = xbdi->xbdi_xen_req.operation;
-	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
-	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);
-
-	/* 
-	 * Request-level reasons not to coalesce: different device,
-	 * different op, or noncontiguous disk sectors (vs. previous
-	 * request handed to us).
-	 */
-	xbdi->xbdi_cont = xbdback_co_io_loop;
-	if (xbdi->xbdi_io != NULL) {
-		struct xbdback_request *last_req;
-		last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car;
-		XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64
-		    "; got %" PRIu64 "\n", xbdi->xbdi_domid,
-		    xbdi->xbdi_next_sector,
-		    xbdi->xbdi_xen_req.sector_number));
-		if ((xrq->rq_operation != last_req->rq_operation)
-		    || (xbdi->xbdi_xen_req.sector_number !=
-		    xbdi->xbdi_next_sector)) {
-			XENPRINTF(("xbdback_io domain %d: segment break\n",
-			    xbdi->xbdi_domid));
-			xbdi->xbdi_next_sector =
-			    xbdi->xbdi_xen_req.sector_number;
-			KASSERT(xbdi->xbdi_io->xio_operation == BLKIF_OP_READ ||
-			    xbdi->xbdi_io->xio_operation == BLKIF_OP_WRITE);
-			xbdi->xbdi_cont_aux = xbdback_co_io_loop;
-			xbdi->xbdi_cont = xbdback_co_map_io;
-		}
-	} else {
-		xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number;
-	}
-	return xbdi;
-}
-
-/* Handle coalescing of multiple segment requests into one I/O work */
-static void *
-xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
-{
-	(void)obj;
-	KASSERT(xbdi->xbdi_req->rq_operation == BLKIF_OP_READ ||
-	    xbdi->xbdi_req->rq_operation == BLKIF_OP_WRITE);
-	if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) {
-		uint8_t this_fs, this_ls, last_ls;
-		grant_ref_t thisgrt;
-		/* 
-		 * Segment-level reason to coalesce: handling full
-		 * pages, or adjacent sector ranges from the same page
-		 * (and yes, this latter does happen).  But not if the
-		 * array of client pseudo-physical pages is full.
-		 */
-		this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect;
-		this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect;
-		thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref;
-		XENPRINTF(("xbdback_io domain %d: "
-			   "first,last_sect[%d]=0%o,0%o\n",
-			   xbdi->xbdi_domid, xbdi->xbdi_segno,
-			   this_fs, this_ls));
-		last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls;
-		xbdi->xbdi_this_fs = this_fs;
-		xbdi->xbdi_this_ls = this_ls;
-		xbdi->xbdi_thisgrt = thisgrt;
-		if (xbdi->xbdi_io != NULL) {
-			if (last_ls == VBD_MAXSECT
-			    && this_fs == 0
-			    && xbdi->xbdi_io->xio_nrma
-			    < XENSHM_MAX_PAGES_PER_REQUEST) {
-				xbdi->xbdi_same_page = 0;
-			} else if (last_ls + 1
-				       == this_fs
-#ifdef notyet
-				   && (last_fas & ~PAGE_MASK)
-				       == (this_fas & ~PAGE_MASK)
-#else 
-				  && 0 /* can't know frame number yet */
-#endif
-			    ) {
-#ifdef DEBUG
-				if (ratecheck(&xbdi->xbdi_lastfragio_time,
-				    &xbdback_fragio_intvl))
-					printf("%s: domain is sending"
-					    " excessively fragmented I/O\n",
-					    xbdi->xbdi_name);
-#endif
-				printf("xbdback_io: would maybe glue "
-				    "same page sec %d (%d->%d)\n",
-				    xbdi->xbdi_segno, this_fs, this_ls);
-				XENPRINTF(("xbdback_io domain %d: glue same "
-				    "page", xbdi->xbdi_domid));
-				panic("notyet!");
-				xbdi->xbdi_same_page = 1;
-			} else {
-				KASSERT(xbdi->xbdi_io->xio_operation ==
-				     BLKIF_OP_READ ||
-				    xbdi->xbdi_io->xio_operation ==
-				     BLKIF_OP_WRITE);
-				xbdi->xbdi_cont_aux = xbdback_co_io_loop;
-				xbdi->xbdi_cont = xbdback_co_map_io;
-				return xbdi;
-			}
-		} else
-			xbdi->xbdi_same_page = 0;
-
-		if (xbdi->xbdi_io == NULL) {
-			xbdi->xbdi_cont = xbdback_co_io_gotio;
-			return xbdback_pool_get(&xbdback_io_pool, xbdi);
-		} else {
-			xbdi->xbdi_cont = xbdback_co_io_gotio2;
-		}
-	} else {
-		/* done with the loop over segments; get next request */
-		xbdi->xbdi_cont = xbdback_co_main_incr;
-	}
-	return xbdi;
-}
-
 /* Prepare an I/O buffer for a xbdback instance */
 static void *
 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
 {
 	struct xbdback_io *xbd_io;
-	vaddr_t start_offset; /* start offset in vm area */
 	int buf_flags;
+	size_t bcount;
+	blkif_request_t *req;
 
 	xbdi_get(xbdi);
 	atomic_inc_uint(&xbdi->xbdi_pendingreqs);
 	
+	req = &xbdi->xbdi_xen_req;
 	xbd_io = xbdi->xbdi_io = obj;
+	memset(xbd_io, 0, sizeof(*xbd_io));
 	buf_init(&xbd_io->xio_buf);
 	xbd_io->xio_xbdi = xbdi;
-	SLIST_INIT(&xbd_io->xio_rq);
-	xbd_io->xio_nrma = 0;
-	xbd_io->xio_mapped = 0;
-	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
+	xbd_io->xio_operation = req->operation;
+	xbd_io->xio_id = req->id;
 
-	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
-	KASSERT(start_offset < PAGE_SIZE);
+	/* Process segments */
+	bcount = 0;
+	for (int i = 0; i < req->nr_segments; i++) {
+		xbd_io->xio_gref[i] = req->seg[i].gref;
+		bcount += (req->seg[i].last_sect - req->seg[i].first_sect + 1)
+			* VBD_BSIZE;
+	}
+	KASSERT(bcount <= MAXPHYS);
+	xbd_io->xio_nrma = req->nr_segments;
 
+	xbd_io->xio_start_offset = req->seg[0].first_sect * VBD_BSIZE;
+	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
+	KASSERT(bcount + xbd_io->xio_start_offset < VBD_VA_SIZE);
+
 	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
 		buf_flags = B_WRITE;
 	} else {
@@ -1439,97 +1209,11 @@
 	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
 	xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
 	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
-	xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector;
-	xbd_io->xio_buf.b_bcount = 0;
-	xbd_io->xio_buf.b_data = (void *)start_offset;
+	xbd_io->xio_buf.b_blkno = req->sector_number;
+	xbd_io->xio_buf.b_bcount = bcount;
+	xbd_io->xio_buf.b_data = NULL;
 	xbd_io->xio_buf.b_private = xbd_io;
 
-	xbdi->xbdi_cont = xbdback_co_io_gotio2;
-	return xbdi;
-}
-
-/* Manage fragments */
-static void *
-xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
-{
-	(void)obj;
-	if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) {
-		/* if this is the first segment of a new request */
-		/* or if it's the first segment of the io */
-		xbdi->xbdi_cont = xbdback_co_io_gotfrag;
-		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
-	}
-	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
-	return xbdi;
-}
-
-/* Prepare the instance for its first fragment */
-static void *
-xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
-{
-	struct xbdback_fragment *xbd_fr;
-
-	xbd_fr = obj;
-	xbd_fr->car = xbdi->xbdi_req;
-	SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr);
-	++xbdi->xbdi_req->rq_iocount;
-
-	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
-	return xbdi;
-}
-
-/* Last routine to manage segments fragments for one I/O */
-static void *
-xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
-{
-	struct xbdback_io *xbd_io;
-	int seg_size;
-	uint8_t this_fs, this_ls;
-
-	this_fs = xbdi->xbdi_this_fs;
-	this_ls = xbdi->xbdi_this_ls;
-	xbd_io = xbdi->xbdi_io;
-	seg_size = this_ls - this_fs + 1;
-
-	if (seg_size < 0) {
-		if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
-			printf("xbdback_io domain %d: negative-size request "
-			    "(%d %d)\n",
-			    xbdi->xbdi_domid, this_ls, this_fs);
-		}
-		xbdback_io_error(xbdi->xbdi_io, EINVAL);
-		xbdi->xbdi_io = NULL;
-		xbdi->xbdi_cont = xbdback_co_main_incr;
-		return xbdi;
-	}
-	
-	if (!xbdi->xbdi_same_page) {
-		XENPRINTF(("xbdback_io domain %d: appending grant %u\n",
-			   xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt));
-		xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt;
-	}
-
-	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
-	XENPRINTF(("xbdback_io domain %d: start sect %ld size %d\n",
-	    xbdi->xbdi_domid, (long)xbdi->xbdi_next_sector, seg_size));
-	
-	/* Finally, the end of the segment loop! */
-	xbdi->xbdi_next_sector += seg_size;
-	++xbdi->xbdi_segno;
-	xbdi->xbdi_cont = xbdback_co_io_loop;
-	return xbdi;
-}
-
-/*
- * Map the different I/O requests in backend's VA space.
- */
-static void *
-xbdback_co_map_io(struct xbdback_instance *xbdi, void *obj)
-{
-	(void)obj;
-	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
-	    xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno,
-	    (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io));
 	xbdi->xbdi_cont = xbdback_co_do_io;
 	return xbdback_map_shm(xbdi->xbdi_io);
 }
@@ -1549,7 +1233,6 @@
 xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
 {
 	struct xbdback_io *xbd_io = xbdi->xbdi_io;
-	vaddr_t start_offset;
 	int nsegs __diagused;
 
 	switch (xbd_io->xio_operation) {
@@ -1569,7 +1252,7 @@
 				error = BLKIF_RSP_ERROR;
 		} else
 			error = BLKIF_RSP_OKAY;
-		xbdback_send_reply(xbdi, xbd_io->xio_flush_id,
+		xbdback_send_reply(xbdi, xbd_io->xio_id,
 		    xbd_io->xio_operation, error);
 		xbdback_pool_put(&xbdback_io_pool, xbd_io);
 		xbdi_put(xbdi);
@@ -1579,30 +1262,18 @@
 	}
 	case BLKIF_OP_READ:
 	case BLKIF_OP_WRITE:
-		start_offset = (vaddr_t)xbd_io->xio_buf.b_data;
-		KASSERT(xbd_io->xio_buf.b_bcount + start_offset < VBD_VA_SIZE);
 		xbd_io->xio_buf.b_data = (void *)
-		    (start_offset + xbd_io->xio_vaddr);
-#ifdef DIAGNOSTIC
-		nsegs = round_page(start_offset + xbd_io->xio_buf.b_bcount)
-		    >> PAGE_SHIFT;
-		if (nsegs > xbd_io->xio_nrma) {
-			printf("xbdback_co_do_io: vaddr %#" PRIxVADDR
-			    " bcount %#x doesn't fit in %d pages\n",
-			    start_offset, xbd_io->xio_buf.b_bcount,
-			    xbd_io->xio_nrma);
-			panic("xbdback_co_do_io: not enough pages");
-		}
-#endif
+		    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
+
 		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
 			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
 			xbd_io->xio_buf.b_vp->v_numoutput++;
 			mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
 		}
-		bdev_strategy(&xbd_io->xio_buf);
 		/* will call xbdback_iodone() asynchronously when done */
+		bdev_strategy(&xbd_io->xio_buf);
 		xbdi->xbdi_io = NULL;
-		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
+		xbdi->xbdi_cont = xbdback_co_main_incr;
 		return xbdi;
 	default:
 		/* Should never happen */
@@ -1622,7 +1293,7 @@
 {
 	struct xbdback_io *xbd_io;
 	struct xbdback_instance *xbdi;
-	int errp;
+	int status;
 
 	KERNEL_LOCK(1, NULL);		/* XXXSMP */
 
@@ -1632,49 +1303,19 @@
 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
 		   xbdi->xbdi_domid, (long)xbd_io));
 
-	if (xbd_io->xio_mapped == 1)
+	KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
+	if (xbd_io->xio_xv != NULL)
 		xbdback_unmap_shm(xbd_io);
 
 	if (bp->b_error != 0) {
 		printf("xbd IO domain %d: error %d\n",
 		       xbdi->xbdi_domid, bp->b_error);
-		errp = 1;
+		status = BLKIF_RSP_ERROR;
 	} else
-		errp = 0;
+		status = BLKIF_RSP_OKAY;
 	
-	/* for each constituent xbd request */
-	while(!SLIST_EMPTY(&xbd_io->xio_rq)) {
-		struct xbdback_fragment *xbd_fr;
-		struct xbdback_request *xbd_req;
-		struct xbdback_instance *rxbdi __diagused;
-		int error;
-		
-		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
-		xbd_req = xbd_fr->car;
-		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
-		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);
-		
-		if (errp)
-			++xbd_req->rq_ioerrs;
-		
-		/* finalize it only if this was its last I/O */
-		if (--xbd_req->rq_iocount > 0)
-			continue;
+	xbdback_send_reply(xbdi, xbd_io->xio_id, xbd_io->xio_operation, status);
 
-		rxbdi = xbd_req->rq_xbdi;
-		KASSERT(xbdi == rxbdi);
-		
-		error = xbd_req->rq_ioerrs > 0
-		    ? BLKIF_RSP_ERROR
-		    : BLKIF_RSP_OKAY;
-
-		XENPRINTF(("xbdback_io domain %d: end request %"PRIu64
-		    " error=%d\n",
-		    xbdi->xbdi_domid, xbd_req->rq_id, error));
-		xbdback_send_reply(xbdi, xbd_req->rq_id,
-		    xbd_req->rq_operation, error);
-		xbdback_pool_put(&xbdback_request_pool, xbd_req);
-	}
 	xbdi_put(xbdi);
 	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
 	buf_destroy(&xbd_io->xio_buf);
@@ -1759,7 +1400,6 @@
 xbdback_map_shm(struct xbdback_io *xbd_io)
 {
 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
-	struct xbdback_request *xbd_rq;
 	int error, s;
 
 #ifdef XENDEBUG_VBD
@@ -1770,11 +1410,7 @@
 	}
 #endif
 
-	KASSERT(xbd_io->xio_mapped == 0);
-
 	s = splvm();	/* XXXSMP */
-	xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
-
 	xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
 	KASSERT(xbd_io->xio_xv != NULL);
 	SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
@@ -1783,7 +1419,7 @@
 
 	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
 	    xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh, 
-	    (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
+	    (xbd_io->xio_operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
 
 	switch(error) {
 	case 0:
@@ -1794,7 +1430,6 @@
 		}
 		printf("\n");
 #endif
-		xbd_io->xio_mapped = 1;
 		return xbdi;
 	default:
 		if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
@@ -1825,8 +1460,7 @@
 	printf("\n");
 #endif
 
-	KASSERT(xbd_io->xio_mapped == 1);
-	xbd_io->xio_mapped = 0;
+	KASSERT(xbd_io->xio_xv != NULL);
 	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
 	    xbd_io->xio_gh);
 	SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
@@ -1836,17 +1470,17 @@
 
 /* Obtain memory from a pool */
 static void *
-xbdback_pool_get(struct xbdback_pool *pp,
+xbdback_pool_get(struct pool_cache *pc,
 			      struct xbdback_instance *xbdi)
 {
-	return pool_cache_get(&pp->pc, PR_WAITOK);
+	return pool_cache_get(pc, PR_WAITOK);
 }
 
 /* Restore memory to a pool */
 static void
-xbdback_pool_put(struct xbdback_pool *pp, void *item)
+xbdback_pool_put(struct pool_cache *pc, void *item)
 {
-	pool_cache_put(&pp->pc, item);
+	pool_cache_put(pc, item);
 }
 
 /*