Thu Jul 27 17:56:31 2023 UTC ()
Pull up following revision(s) (requested by bouyer in ticket #248):

	sys/arch/xen/xen/xbd_xenbus.c: revision 1.132 (patch)
	sys/arch/xen/xen/xbd_xenbus.c: revision 1.133 (patch)
	sys/arch/xen/xen/xbd_xenbus.c: revision 1.134 (patch)

The disk size reported in the xenstore is always in XEN_BSIZE units,
not sector-size. Should fix the issue reported by Christian Kujau
on netbsd-users and port-xen.

Also use XEN_BSIZE when computing the number of bytes for format_bytes().
While there note in a comment that sc_sectors is in XEN_BSIZE units

Propoerly handle 4k sector size backends:
- report the backend's sector size to upper layers, not DEV_BSIZE.
  Adjust the number of sectors accordingly.
- Use sc_secsize instead of XEN_BSIZE where appropriate. The sectors numbers
  in I/O requests are still in XEN_BSIZE units, but must be a multiple
  of sc_secsize/XEN_BSIZE.
- As a consequence of previous, the buffer has to be aligned to sc_secsize,
  aligned to XEN_BSIZE may not be enough. This means that we may have to
  xbd_map_align() more buffer, including some without B_PHYS set.
- Add some more DPRINTF lines, related to I/O requests

Tested with a linux dom0.

thanks to Christian Kujau for providing access to his hardware for testing
and debugging.


(martin)
diff -r1.129 -r1.129.20.1 src/sys/arch/xen/xen/xbd_xenbus.c

cvs diff -r1.129 -r1.129.20.1 src/sys/arch/xen/xen/xbd_xenbus.c (expand / switch to context diff)
--- src/sys/arch/xen/xen/xbd_xenbus.c 2020/07/13 21:21:56 1.129
+++ src/sys/arch/xen/xen/xbd_xenbus.c 2023/07/27 17:56:31 1.129.20.1
@@ -1,4 +1,4 @@
-/*      $NetBSD: xbd_xenbus.c,v 1.129 2020/07/13 21:21:56 jdolecek Exp $      */
+/*      $NetBSD: xbd_xenbus.c,v 1.129.20.1 2023/07/27 17:56:31 martin Exp $      */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -50,7 +50,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.129 2020/07/13 21:21:56 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.129.20.1 2023/07/27 17:56:31 martin Exp $");
 
 #include "opt_xen.h"
 
@@ -169,7 +169,7 @@
 #define BLKIF_SHUTDOWN_REMOTE 1 /* backend-initiated shutdown in progress */
 #define BLKIF_SHUTDOWN_LOCAL  2 /* locally-initiated shutdown in progress */
 
-	uint64_t sc_sectors; /* number of sectors for this device */
+	uint64_t sc_sectors; /* number of sc_secsize sectors for this device */
 	u_long sc_secsize; /* sector size */
 	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
 	u_long sc_info; /* VDISK_* */
@@ -678,11 +678,10 @@
 		dg = &sc->sc_dksc.sc_dkdev.dk_geom;
 		memset(dg, 0, sizeof(*dg));	
 
-		dg->dg_secperunit = sc->sc_xbdsize;
-		dg->dg_secsize = DEV_BSIZE;
+		dg->dg_secperunit = sc->sc_sectors;
+		dg->dg_secsize = sc->sc_secsize;
 		dg->dg_ntracks = 1;
-		// XXX: Ok to hard-code DEV_BSIZE?
-		dg->dg_nsectors = 1024 * (1024 / dg->dg_secsize);
+		dg->dg_nsectors = (1024 * 1024) / dg->dg_secsize;
 		dg->dg_ncylinders = dg->dg_secperunit / dg->dg_nsectors;
 
 		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
@@ -693,10 +692,10 @@
 		hypervisor_unmask_event(sc->sc_evtchn);
 
 		format_bytes(buf, uimin(9, sizeof(buf)),
-		    sc->sc_sectors * sc->sc_secsize);
+		    sc->sc_sectors * dg->dg_secsize);
 		aprint_normal_dev(sc->sc_dksc.sc_dev,
 				"%s, %d bytes/sect x %" PRIu64 " sectors\n",
-				buf, (int)dg->dg_secsize, sc->sc_xbdsize);
+				buf, (int)dg->dg_secsize, sc->sc_sectors);
 		snprintb(buf, sizeof(buf), BLKIF_FEATURE_BITS,
 		    sc->sc_features);
 		aprint_normal_dev(sc->sc_dksc.sc_dev,
@@ -739,14 +738,6 @@
 		panic("%s: can't read number from %s/virtual-device\n", 
 		    device_xname(sc->sc_dksc.sc_dev),
 		    sc->sc_xbusd->xbusd_otherend);
-	err = xenbus_read_ull(NULL,
-	    sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
-	if (err)
-		panic("%s: can't read number from %s/sectors\n", 
-		    device_xname(sc->sc_dksc.sc_dev),
-		    sc->sc_xbusd->xbusd_otherend);
-	sc->sc_sectors = sectors;
-
 	err = xenbus_read_ul(NULL,
 	    sc->sc_xbusd->xbusd_otherend, "info", &sc->sc_info, 10);
 	if (err)
@@ -759,6 +750,13 @@
 		panic("%s: can't read number from %s/sector-size\n", 
 		    device_xname(sc->sc_dksc.sc_dev),
 		    sc->sc_xbusd->xbusd_otherend);
+	err = xenbus_read_ull(NULL,
+	    sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
+	if (err)
+		panic("%s: can't read number from %s/sectors\n", 
+		    device_xname(sc->sc_dksc.sc_dev),
+		    sc->sc_xbusd->xbusd_otherend);
+	sc->sc_sectors = sectors * (uint64_t)XEN_BSIZE / sc->sc_secsize;
 
 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
 }
@@ -839,6 +837,8 @@
 		    bp, (long)bp->b_bcount));
 
 		if (bp->b_error != 0 || rep->status != BLKIF_RSP_OKAY) {
+			DPRINTF(("%s: error %d status %d\n", __func__,
+			    bp->b_error, rep->status));
 			bp->b_error = EIO;
 			bp->b_resid = bp->b_bcount;
 		}
@@ -1138,7 +1138,7 @@
 		goto out;
 	}
 
-	if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_xbdsize) {
+	if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_sectors) {
 		/* invalid block number */
 		error = EINVAL;
 		goto out;
@@ -1174,7 +1174,7 @@
 	bp->b_resid = bp->b_bcount;
 	xbdreq->req_bp = bp;
 	xbdreq->req_data = bp->b_data;
-	if (__predict_false((vaddr_t)bp->b_data & (XEN_BSIZE - 1))) {
+	if (__predict_false((vaddr_t)bp->b_data & (sc->sc_secsize - 1))) {
 		if (__predict_false(xbd_map_align(sc, xbdreq) != 0)) {
 			DPRINTF(("xbd_diskstart: no align\n"));
 			error = EAGAIN;
@@ -1275,8 +1275,12 @@
 	req->id = req_id;
 	req->operation =
 	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
-	req->sector_number = bp->b_rawblkno + (start >> XEN_BSHIFT);
+	req->sector_number = (bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE) +
+	    (start >> XEN_BSHIFT);
 	req->handle = sc->sc_handle;
+	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
+	    __func__, req->id, req->operation, req->sector_number,
+	    req->handle));
 
 	size = uimin(bp->b_bcount - start, XBD_MAX_CHUNK); 
 	for (dmaseg = 0; dmaseg < dmamap->dm_nsegs && size > 0; dmaseg++) {
@@ -1296,9 +1300,9 @@
 		}
 		size -= nbytes;
 
-		KASSERT(((ma & PAGE_MASK) & (XEN_BSIZE - 1)) == 0);
-		KASSERT((nbytes & (XEN_BSIZE - 1)) == 0);
-		KASSERT((size & (XEN_BSIZE - 1)) == 0);
+		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
+		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
+		KASSERT((size & (sc->sc_secsize - 1)) == 0);
 		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
 		nsects = nbytes >> XEN_BSHIFT;
 
@@ -1307,6 +1311,8 @@
 		reqseg->last_sect = first_sect + nsects - 1;
 		KASSERT(reqseg->first_sect <= reqseg->last_sect);
 		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
+		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, segidx,
+		    reqseg->first_sect, reqseg->last_sect));
 
 		reqseg->gref = gntref[dmaseg];
 	}
@@ -1332,8 +1338,11 @@
 	req->operation = BLKIF_OP_INDIRECT;
 	req->indirect_op =
 	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
-	req->sector_number = bp->b_rawblkno;
+	req->sector_number = bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE;
 	req->handle = sc->sc_handle;
+	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
+	    __func__, req->id, req->indirect_op, req->sector_number,
+	    req->handle));
 
 	xbdreq->req_indirect = SLIST_FIRST(&sc->sc_indirect_head);
 	KASSERT(xbdreq->req_indirect != NULL);	/* always as many as reqs */
@@ -1347,12 +1356,17 @@
 		ma = ds->ds_addr;
 		nbytes = ds->ds_len;
 
+		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
+		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
+
 		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
 		nsects = nbytes >> XEN_BSHIFT;
 
 		reqseg->first_sect = first_sect;
 		reqseg->last_sect = first_sect + nsects - 1;
 		reqseg->gref = xbdreq->req_gntref[dmaseg];
+		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, dmaseg,
+		    reqseg->first_sect, reqseg->last_sect));
 
 		KASSERT(reqseg->first_sect <= reqseg->last_sect);
 		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
@@ -1368,12 +1382,6 @@
 static int
 xbd_map_align(struct xbd_xenbus_softc *sc, struct xbd_req *req)
 {
-	/*
-	 * Only can get here if this is physio() request, block I/O
-	 * uses DEV_BSIZE-aligned buffers.
-	 */
-	KASSERT((req->req_bp->b_flags & B_PHYS) != 0);
-
 	sc->sc_cnt_map_unalign.ev_count++;
 
 	if (sc->sc_unalign_used) {