shithub: riscv

Download patch

ref: f58d99aa7a97ba5f79af89f38b78d5924d4e35a2
parent: c3589ef3cf33189d342a3ab638b558fd9249b220
author: cinap_lenrek <[email protected]>
date: Sun Jul 11 07:24:13 EDT 2021

virtio: add non-legacy virtio 1.0 drivers for disk and ethernet

The new interface uses pci capability structures to locate the
registers in a rather fine granular way making it more complicated
as they can be located anywhere in any pci bar at any offset.

As far as i can see, qemu (6.0.50) never uses i/o bars in
non-legacy mode, so only mmio is implemented for now.

The previous virtio drivers implemented the legacy interface only
which uses i/o ports for all register accesses. This is still
the preferred method (and also qemu default) as it is easier to
emulate and most likely faster.

However, some vps providers like vultr force the legacy interface
to disabled with qemu -device option "disable-legacy=on" resulting
on a system without a disk and ethernet.

--- a/sys/src/9/pc/ethervirtio.c
+++ b/sys/src/9/pc/ethervirtio.c
@@ -1,3 +1,7 @@
+/*
+ * virtio ethernet driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
 #include "u.h"
 #include "../port/lib.h"
 #include "mem.h"
@@ -9,11 +13,6 @@
 #include "../port/netif.h"
 #include "../port/etherif.h"
 
-/*
- * virtio ethernet driver
- * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
- */
-
 typedef struct Vring Vring;
 typedef struct Vdesc Vdesc;
 typedef struct Vused Vused;
@@ -555,13 +554,14 @@
 	h = t = nil;
 
 	/* §4.1.2 PCI Device Discovery */
-	for(p = nil; p = pcimatch(p, 0, 0);){
-		if(p->vid != 0x1AF4)
-			continue;
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0);){
 		/* the two possible DIDs for virtio-net */
 		if(p->did != 0x1000 && p->did != 0x1041)
 			continue;
-		/* non-transitional devices will have a revision > 0 */
+		/*
+		 * non-transitional devices will have a revision > 0,
+		 * these are handled by ethervirtio10 driver.
+		 */
 		if(p->rid != 0)
 			continue;
 		/* first membar needs to be I/O */
@@ -588,6 +588,8 @@
 
 		/* §3.1.2 Legacy Device Initialization */
 		outb(c->port+Qstatus, 0);
+		while(inb(c->port+Qstatus) != 0)
+			delay(1);
 		outb(c->port+Qstatus, Sacknowledge|Sdriver);
 
 		/* negotiate feature bits */
--- /dev/null
+++ b/sys/src/9/pc/ethervirtio10.c
@@ -1,0 +1,790 @@
+/*
+ * virtio 1.0 ethernet driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to ethervirtio.c, this driver handles the non-legacy
+ * interface for virtio ethernet which uses mmio for all register accesses
+ * and requires a laborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason why this driver is needed it is that vultr set the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "../port/error.h"
+#include "../port/netif.h"
+#include "../port/etherif.h"
+
+typedef struct Vconfig Vconfig;
+typedef struct Vnetcfg Vnetcfg;
+
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vheader Vheader;
+typedef struct Vqueue Vqueue;
+
+typedef struct Ctlr Ctlr;
+
+enum {
+	/* §2.1 Device Status Field */
+	Sacknowledge = 1,
+	Sdriver = 2,
+	Sdriverok = 4,
+	Sfeatureok = 8,
+	Sfailed = 128,
+
+	/* flags in Qnetstatus */
+	Nlinkup = (1<<0),
+	Nannounce = (1<<1),
+
+	/* feat[0] bits */
+	Fmac = 1<<5,
+	Fstatus = 1<<16,
+	Fctrlvq = 1<<17,
+	Fctrlrx = 1<<18,
+
+	/* feat[1] bits */
+	Fversion1 = 1<<(32-32),
+
+	/* vring used flags */
+	Unonotify = 1,
+	/* vring avail flags */
+	Rnointerrupt = 1,
+
+	/* descriptor flags */
+	Dnext = 1,
+	Dwrite = 2,
+	Dindirect = 4,
+
+	/* struct sizes */
+	VringSize = 4,
+	VdescSize = 16,
+	VusedSize = 8,
+	VheaderSize = 12,
+
+	Vrxq	= 0,
+	Vtxq	= 1,
+	Vctlq	= 2,
+
+	/* class/cmd for Vctlq */
+	CtrlRx	= 0x00,
+		CmdPromisc	= 0x00,
+		CmdAllmulti	= 0x01,
+	CtrlMac	= 0x01,
+		CmdMacTableSet	= 0x00,
+	CtrlVlan= 0x02,
+		CmdVlanAdd	= 0x00,
+		CmdVlanDel	= 0x01,
+};
+
+struct Vconfig {
+	u32int	devfeatsel;
+	u32int	devfeat;
+	u32int	drvfeatsel;
+	u32int	drvfeat;
+
+	u16int	msixcfg;
+	u16int	nqueues;
+
+	u8int	status;
+	u8int	cfggen;
+	u16int	queuesel;
+
+	u16int	queuesize;
+	u16int	queuemsixvect;
+
+	u16int	queueenable;
+	u16int	queuenotifyoff;
+
+	u64int	queuedesc;
+	u64int	queueavail;
+	u64int	queueused;
+};
+
+struct Vnetcfg
+{
+	u16int	mac0;
+	u16int	mac1;
+	u16int	mac2;
+	u16int	status;
+	u16int	maxqueuepairs;
+	u16int	mtu;
+};
+
+struct Vring
+{
+	u16int	flags;
+	u16int	idx;
+};
+
+struct Vdesc
+{
+	u64int	addr;
+	u32int	len;
+	u16int	flags;
+	u16int	next;
+};
+
+struct Vused
+{
+	u32int	id;
+	u32int	len;
+};
+
+struct Vheader
+{
+	u8int	flags;
+	u8int	segtype;
+	u16int	hlen;
+	u16int	seglen;
+	u16int	csumstart;
+	u16int	csumend;
+};
+
+struct Vqueue
+{
+	Rendez;
+
+	uint	qsize;
+	uint	qmask;
+
+	Vdesc	*desc;
+
+	Vring	*avail;
+	u16int	*availent;
+	u16int	*availevent;
+
+	Vring	*used;
+	Vused	*usedent;
+	u16int	*usedevent;
+	u16int	lastused;
+
+	uint	nintr;
+	uint	nnote;
+
+	/* notify register */
+	void	*notify;
+};
+
+struct Ctlr {
+	Lock;
+
+	QLock	ctllock;
+
+	int	attached;
+
+	/* registers */
+	Vconfig	*cfg;
+	Vnetcfg *dev;
+	u8int	*isr;
+	u8int	*notify;
+	u32int	notifyoffmult;
+
+	uvlong	port;
+	Pcidev	*pcidev;
+	Ctlr	*next;
+	int	active;
+	ulong	feat[2];
+	int	nqueue;
+
+	/* virtioether has 3 queues: rx, tx and ctl */
+	Vqueue	queue[3];
+};
+
+static Ctlr *ctlrhead;
+
+static int
+vhasroom(void *v)
+{
+	Vqueue *q = v;
+	return q->lastused != q->used->idx;
+}
+
+static void
+vqnotify(Ctlr *ctlr, int x)
+{
+	Vqueue *q;
+
+	coherence();
+	q = &ctlr->queue[x];
+	if(q->used->flags & Unonotify)
+		return;
+	q->nnote++;
+	*((u16int*)q->notify) = x;
+}
+
+static void
+txproc(void *v)
+{
+	Vheader *header;
+	Block **blocks;
+	Ether *edev;
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vused *u;
+	Block *b;
+	int i, j;
+
+	edev = v;
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vtxq];
+
+	header = smalloc(VheaderSize);
+	blocks = smalloc(sizeof(Block*) * (q->qsize/2));
+
+	for(i = 0; i < q->qsize/2; i++){
+		j = i << 1;
+		q->desc[j].addr = PADDR(header);
+		q->desc[j].len = VheaderSize;
+		q->desc[j].next = j | 1;
+		q->desc[j].flags = Dnext;
+
+		q->availent[i] = q->availent[i + q->qsize/2] = j;
+
+		j |= 1;
+		q->desc[j].next = 0;
+		q->desc[j].flags = 0;
+	}
+
+	q->avail->flags &= ~Rnointerrupt;
+
+	while(waserror())
+		;
+
+	while((b = qbread(edev->oq, 1000000)) != nil){
+		for(;;){
+			/* retire completed packets */
+			while((i = q->lastused) != q->used->idx){
+				u = &q->usedent[i & q->qmask];
+				i = (u->id & q->qmask) >> 1;
+				if(blocks[i] == nil)
+					break;
+				freeb(blocks[i]);
+				blocks[i] = nil;
+				q->lastused++;
+			}
+
+			/* have free slot? */
+			i = q->avail->idx & (q->qmask >> 1);
+			if(blocks[i] == nil)
+				break;
+
+			/* ring full, wait and retry */
+			if(!vhasroom(q))
+				sleep(q, vhasroom, q);
+		}
+
+		/* slot is free, fill in descriptor */
+		blocks[i] = b;
+		j = (i << 1) | 1;
+		q->desc[j].addr = PADDR(b->rp);
+		q->desc[j].len = BLEN(b);
+		coherence();
+		q->avail->idx++;
+		vqnotify(ctlr, Vtxq);
+	}
+
+	pexit("ether out queue closed", 1);
+}
+
+static void
+rxproc(void *v)
+{
+	Vheader *header;
+	Block **blocks;
+	Ether *edev;
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vused *u;
+	Block *b;
+	int i, j;
+
+	edev = v;
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vrxq];
+
+	header = smalloc(VheaderSize);
+	blocks = smalloc(sizeof(Block*) * (q->qsize/2));
+
+	for(i = 0; i < q->qsize/2; i++){
+		j = i << 1;
+		q->desc[j].addr = PADDR(header);
+		q->desc[j].len = VheaderSize;
+		q->desc[j].next = j | 1;
+		q->desc[j].flags = Dwrite|Dnext;
+
+		q->availent[i] = q->availent[i + q->qsize/2] = j;
+
+		j |= 1;
+		q->desc[j].next = 0;
+		q->desc[j].flags = Dwrite;
+	}
+
+	q->avail->flags &= ~Rnointerrupt;
+
+	while(waserror())
+		;
+
+	for(;;){
+		/* replenish receive ring */
+		do {
+			i = q->avail->idx & (q->qmask >> 1);
+			if(blocks[i] != nil)
+				break;
+			if((b = iallocb(ETHERMAXTU)) == nil)
+				break;
+			blocks[i] = b;
+			j = (i << 1) | 1;
+			q->desc[j].addr = PADDR(b->rp);
+			q->desc[j].len = BALLOC(b);
+			coherence();
+			q->avail->idx++;
+		} while(q->avail->idx != q->used->idx);
+		vqnotify(ctlr, Vrxq);
+
+		/* wait for any packets to complete */
+		if(!vhasroom(q))
+			sleep(q, vhasroom, q);
+
+		/* retire completed packets */
+		while((i = q->lastused) != q->used->idx) {
+			u = &q->usedent[i & q->qmask];
+			i = (u->id & q->qmask) >> 1;
+			if((b = blocks[i]) == nil)
+				break;
+
+			blocks[i] = nil;
+			b->wp = b->rp + u->len - VheaderSize;
+			etheriq(edev, b);
+			q->lastused++;
+		}
+	}
+}
+
+static int
+vctlcmd(Ether *edev, uchar class, uchar cmd, uchar *data, int ndata)
+{
+	uchar hdr[2], ack[1];
+	Ctlr *ctlr;
+	Vqueue *q;
+	Vdesc *d;
+	int i;
+
+	ctlr = edev->ctlr;
+	q = &ctlr->queue[Vctlq];
+	if(q->qsize < 3)
+		return -1;
+
+	qlock(&ctlr->ctllock);
+	while(waserror())
+		;
+
+	ack[0] = 0x55;
+	hdr[0] = class;
+	hdr[1] = cmd;
+
+	d = &q->desc[0];
+	d->addr = PADDR(hdr);
+	d->len = sizeof(hdr);
+	d->next = 1;
+	d->flags = Dnext;
+	d++;
+	d->addr = PADDR(data);
+	d->len = ndata;
+	d->next = 2;
+	d->flags = Dnext;
+	d++;
+	d->addr = PADDR(ack);
+	d->len = sizeof(ack);
+	d->next = 0;
+	d->flags = Dwrite;
+
+	i = q->avail->idx & q->qmask;
+	q->availent[i] = 0;
+	coherence();
+
+	q->avail->flags &= ~Rnointerrupt;
+	q->avail->idx++;
+	vqnotify(ctlr, Vctlq);
+	while(!vhasroom(q))
+		sleep(q, vhasroom, q);
+	q->lastused = q->used->idx;
+	q->avail->flags |= Rnointerrupt;
+
+	qunlock(&ctlr->ctllock);
+	poperror();
+
+	if(ack[0] != 0)
+		print("#l%d: vctlcmd: %ux.%ux -> %ux\n", edev->ctlrno, class, cmd, ack[0]);
+
+	return ack[0];
+}
+
+static void
+interrupt(Ureg*, void* arg)
+{
+	Ether *edev;
+	Ctlr *ctlr;
+	Vqueue *q;
+	int i;
+
+	edev = arg;
+	ctlr = edev->ctlr;
+	if(*ctlr->isr & 1){
+		for(i = 0; i < ctlr->nqueue; i++){
+			q = &ctlr->queue[i];
+			if(vhasroom(q)){
+				q->nintr++;
+				wakeup(q);
+			}
+		}
+	}
+}
+
+static void
+attach(Ether* edev)
+{
+	char name[KNAMELEN];
+	Ctlr* ctlr;
+	int i;
+
+	ctlr = edev->ctlr;
+	ilock(ctlr);
+	if(ctlr->attached){
+		iunlock(ctlr);
+		return;
+	}
+	ctlr->attached = 1;
+
+	/* driver is ready */
+	ctlr->cfg->status |= Sdriverok;
+
+	/* enable the queues */
+	for(i = 0; i < ctlr->nqueue; i++){
+		ctlr->cfg->queuesel = i;
+		ctlr->cfg->queueenable = 1;
+	}
+	iunlock(ctlr);
+
+	/* start kprocs */
+	snprint(name, sizeof name, "#l%drx", edev->ctlrno);
+	kproc(name, rxproc, edev);
+	snprint(name, sizeof name, "#l%dtx", edev->ctlrno);
+	kproc(name, txproc, edev);
+}
+
+static long
+ifstat(Ether *edev, void *a, long n, ulong offset)
+{
+	int i, l;
+	char *p;
+	Ctlr *ctlr;
+	Vqueue *q;
+
+	ctlr = edev->ctlr;
+
+	p = smalloc(READSTR);
+
+	l = snprint(p, READSTR, "devfeat %32.32lub %32.32lub\n", ctlr->feat[1], ctlr->feat[0]);
+	l += snprint(p+l, READSTR-l, "devstatus %8.8ub\n", ctlr->cfg->status);
+
+	for(i = 0; i < ctlr->nqueue; i++){
+		q = &ctlr->queue[i];
+		l += snprint(p+l, READSTR-l,
+			"vq%d %#p size %d avail->idx %d used->idx %d lastused %hud nintr %ud nnote %ud\n",
+			i, q, q->qsize, q->avail->idx, q->used->idx, q->lastused, q->nintr, q->nnote);
+	}
+
+	n = readstr(offset, a, n, p);
+	free(p);
+
+	return n;
+}
+
+static void
+shutdown(Ether* edev)
+{
+	Ctlr *ctlr = edev->ctlr;
+
+	coherence();
+	ctlr->cfg->status = 0;
+	coherence();
+
+	pciclrbme(ctlr->pcidev);
+}
+
+static void
+promiscuous(void *arg, int on)
+{
+	Ether *edev = arg;
+	uchar b[1];
+
+	b[0] = on != 0;
+	vctlcmd(edev, CtrlRx, CmdPromisc, b, sizeof(b));
+}
+
+static void
+multicast(void *arg, uchar*, int)
+{
+	Ether *edev = arg;
+	uchar b[1];
+
+	b[0] = edev->nmaddr > 0;
+	vctlcmd(edev, CtrlRx, CmdAllmulti, b, sizeof(b));
+}
+
+static int
+initqueue(Vqueue *q, int size)
+{
+	uchar *p;
+
+	q->desc = mallocalign(VdescSize*size, 16, 0, 0);
+	if(q->desc == nil)
+		return -1;
+	p = mallocalign(VringSize + 2*size + 2, 2, 0, 0);
+	if(p == nil){
+FreeDesc:
+		free(q->desc);
+		q->desc = nil;
+		return -1;
+	}
+	q->avail = (void*)p;
+	p += VringSize;
+	q->availent = (void*)p;
+	p += sizeof(u16int)*size;
+	q->availevent = (void*)p;
+	p = mallocalign(VringSize + VusedSize*size + 2, 4, 0, 0);
+	if(p == nil){
+		free(q->avail);
+		q->avail = nil;
+		goto FreeDesc;
+	}
+	q->used = (void*)p;
+	p += VringSize;
+	q->usedent = (void*)p;
+	p += VusedSize*size;
+	q->usedevent = (void*)p;
+
+	q->qsize = size;
+	q->qmask = q->qsize - 1;
+
+	q->lastused = q->avail->idx = q->used->idx = 0;
+
+	q->avail->flags |= Rnointerrupt;
+
+	return 0;
+}
+
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+	int bar;
+
+	if(cap != 9 || pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	bar = pcicfgr8(p, off+4);
+	if(bar < 0 || bar >= nelem(p->mem) 
+	|| p->mem[bar].size == 0
+	|| (p->mem[bar].bar & 3) != 0)
+		return 1;
+
+	return 0;
+}
+
+static int
+virtiocap(Pcidev *p, int typ)
+{
+	return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+	int bar, len;
+	uvlong addr;
+
+	if(cap < 0)
+		return nil;
+	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+	addr = pcicfgr32(p, cap+8);
+	len = pcicfgr32(p, cap+12);
+	if(size <= 0)
+		size = len;
+	else if(len < size)
+		return nil;
+	if(addr+len > p->mem[bar].size)
+		return nil;
+	addr += p->mem[bar].bar & ~0xFULL;
+	return vmap(addr, size);
+}
+
+static Ctlr*
+pciprobe(void)
+{
+	Ctlr *c, *h, *t;
+	Pcidev *p;
+	Vconfig *cfg;
+	int bar, cap, n, i;
+
+	h = t = nil;
+
+	/* §4.1.2 PCI Device Discovery */
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1041);){
+		/* non-transitional devices will have a revision > 0 */
+		if(p->rid == 0)
+			continue;
+		if((cap = virtiocap(p, 1)) < 0)
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((c = mallocz(sizeof(Ctlr), 1)) == nil){
+			print("ethervirtio: no memory for Ctlr\n");
+			break;
+		}
+		c->cfg = cfg;
+		c->pcidev = p;
+		c->port = p->mem[bar].bar & ~0xFULL;
+
+		pcienable(p);
+		c->dev = virtiomapregs(p, virtiocap(p, 4), sizeof(Vnetcfg));
+		if(c->dev == nil)
+			goto Baddev;
+		c->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+		if(c->isr == nil)
+			goto Baddev;
+		cap = virtiocap(p, 2);
+		c->notify = virtiomapregs(p, cap, 0);
+		if(c->notify == nil)
+			goto Baddev;
+		c->notifyoffmult = pcicfgr32(p, cap+16);
+
+		/* device reset */
+		coherence();
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Sacknowledge|Sdriver;
+
+		/* negotiate feature bits */
+		cfg->devfeatsel = 1;
+		c->feat[1] = cfg->devfeat;
+
+		cfg->devfeatsel = 0;
+		c->feat[0] = cfg->devfeat;
+
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = c->feat[1] & Fversion1;
+
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = c->feat[0] & (Fmac|Fctrlvq|Fctrlrx);
+
+		for(i=0; i<nelem(c->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			if(n == 0 || (n & (n-1)) != 0){
+				if(i < 2)
+					print("ethervirtio: queue %d has invalid size %d\n", i, n);
+				break;
+			}
+			if(initqueue(&c->queue[i], n) < 0)
+				break;
+			c->queue[i].notify = c->notify + c->notifyoffmult * cfg->queuenotifyoff;
+			coherence();
+			cfg->queuedesc = PADDR(c->queue[i].desc);
+			cfg->queueavail = PADDR(c->queue[i].avail);
+			cfg->queueused = PADDR(c->queue[i].used);
+		}
+		if(i < 2){
+			print("ethervirtio: no queues\n");
+Baddev:
+			pcidisable(p);
+			/* TODO, vunmap */
+			free(c);
+			continue;
+		}
+		c->nqueue = i;		
+
+		if(h == nil)
+			h = c;
+		else
+			t->next = c;
+		t = c;
+	}
+
+	return h;
+}
+
+
+static int
+reset(Ether* edev)
+{
+	static uchar zeros[Eaddrlen];
+	Ctlr *ctlr;
+	int i;
+
+	if(ctlrhead == nil)
+		ctlrhead = pciprobe();
+
+	for(ctlr = ctlrhead; ctlr != nil; ctlr = ctlr->next){
+		if(ctlr->active)
+			continue;
+		if(edev->port == 0 || edev->port == ctlr->port){
+			ctlr->active = 1;
+			break;
+		}
+	}
+
+	if(ctlr == nil)
+		return -1;
+
+	edev->ctlr = ctlr;
+	edev->port = ctlr->port;
+	edev->irq = ctlr->pcidev->intl;
+	edev->tbdf = ctlr->pcidev->tbdf;
+	edev->mbps = 1000;
+	edev->link = 1;
+
+	if((ctlr->feat[0] & Fmac) != 0 && memcmp(edev->ea, zeros, Eaddrlen) == 0){
+		for(i = 0; i < Eaddrlen; i++)
+			edev->ea[i] = ((uchar*)ctlr->dev)[i];
+	} else {
+		for(i = 0; i < Eaddrlen; i++)
+			((uchar*)ctlr->dev)[i] = edev->ea[i];
+	}
+
+	edev->arg = edev;
+
+	edev->attach = attach;
+	edev->shutdown = shutdown;
+	edev->ifstat = ifstat;
+
+	if((ctlr->feat[0] & (Fctrlvq|Fctrlrx)) == (Fctrlvq|Fctrlrx)){
+		edev->multicast = multicast;
+		edev->promiscuous = promiscuous;
+	}
+
+	pcisetbme(ctlr->pcidev);
+	intrenable(edev->irq, interrupt, edev, edev->tbdf, edev->name);
+
+	return 0;
+}
+
+void
+ethervirtio10link(void)
+{
+	addethercard("virtio10", reset);
+}
--- a/sys/src/9/pc/pc
+++ b/sys/src/9/pc/pc
@@ -80,6 +80,7 @@
 	etherwpi	pci wifi
 	etherrt2860	pci wifi
 	ethervirtio	pci
+	ethervirtio10	pci
 	ethermedium
 	pcmciamodem
 	netdevmedium
@@ -108,6 +109,7 @@
 	sdiahci		pci sdscsi led
 	sdodin		pci sdscsi led
 	sdvirtio	pci sdscsi
+	sdvirtio10	pci sdscsi
 	sdmmc		pci pmmc
 	sdnvme		pci
 	sdloop
--- a/sys/src/9/pc/sdvirtio.c
+++ b/sys/src/9/pc/sdvirtio.c
@@ -1,3 +1,7 @@
+/*
+ * virtio ethernet driver implementing the legacy interface:
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ */
 #include "u.h"
 #include "../port/lib.h"
 #include "mem.h"
--- /dev/null
+++ b/sys/src/9/pc/sdvirtio10.c
@@ -1,0 +1,808 @@
+/*
+ * virtio 1.0 disk driver
+ * http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html
+ *
+ * In contrast to sdvirtio.c, this driver handles the non-legacy
+ * interface for virtio disk which uses mmio for all register accesses
+ * and requires a laborate pci capability structure dance to get working.
+ *
+ * It is kind of pointless as it is most likely slower than
+ * port i/o (harder to emulate on the pc platform).
+ * 
+ * The reason why this driver is needed it is that vultr set the
+ * disable-legacy=on option in the -device parameter for qemu
+ * on their hypervisor.
+ */
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "../port/pci.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct Vscsidev Vscsidev;
+typedef struct Vblkdev Vblkdev;
+
+typedef struct Vconfig Vconfig;
+typedef struct Vring Vring;
+typedef struct Vdesc Vdesc;
+typedef struct Vused Vused;
+typedef struct Vqueue Vqueue;
+typedef struct Vdev Vdev;
+
+
+/* device types */
+enum {
+	TypBlk	= 2,
+	TypSCSI	= 8,
+};
+
+/* status flags */
+enum {
+	Acknowledge = 1,
+	Driver = 2,
+	DriverOk = 4,
+	Failed = 0x80,
+};
+
+/* descriptor flags */
+enum {
+	Next = 1,
+	Write = 2,
+	Indirect = 4,
+};
+
+/* struct sizes */
+enum {
+	VringSize = 4,
+};
+
+enum {
+	CDBSIZE		= 32,
+	SENSESIZE	= 96,
+};
+
+	
+struct Vscsidev
+{
+	u32int	num_queues;
+	u32int	seg_max;
+	u32int	max_sectors;
+	u32int	cmd_per_lun;
+	u32int	event_info_size;
+	u32int	sense_size;
+	u32int	cdb_size;
+	u16int	max_channel;
+	u16int	max_target;
+	u32int	max_lun;
+};
+
+struct Vblkdev
+{
+	u64int	capacity;
+};
+
+struct Vconfig {
+	u32int	devfeatsel;
+	u32int	devfeat;
+	u32int	drvfeatsel;
+	u32int	drvfeat;
+
+	u16int	msixcfg;
+	u16int	nqueues;
+
+	u8int	status;
+	u8int	cfggen;
+	u16int	queuesel;
+
+	u16int	queuesize;
+	u16int	queuemsixvect;
+
+	u16int	queueenable;
+	u16int	queuenotifyoff;
+
+	u64int	queuedesc;
+	u64int	queueavail;
+	u64int	queueused;
+};
+
+struct Vring
+{
+	u16int	flags;
+	u16int	idx;
+};
+
+struct Vdesc
+{
+	u64int	addr;
+	u32int	len;
+	u16int	flags;
+	u16int	next;
+};
+
+struct Vused
+{
+	u32int	id;
+	u32int	len;
+};
+
+struct Vqueue
+{
+	Lock;
+
+	Vdev	*dev;
+	void	*notify;
+	int	idx;
+
+	int	size;
+
+	int	free;
+	int	nfree;
+
+	Vdesc	*desc;
+
+	Vring	*avail;
+	u16int	*availent;
+	u16int	*availevent;
+
+	Vring	*used;
+	Vused	*usedent;
+	u16int	*usedevent;
+	u16int	lastused;
+
+	void	*rock[];
+};
+
+struct Vdev
+{
+	int	typ;
+
+	Pcidev	*pci;
+
+	uvlong	port;
+	ulong	feat[2];
+
+	int	nqueue;
+	Vqueue	*queue[16];
+
+	void	*dev;	/* device specific config (for scsi) */
+
+	/* registers */
+	Vconfig	*cfg;
+	u8int	*isr;
+	u8int	*notify;
+	u32int	notifyoffmult;
+
+	Vdev	*next;
+};
+
+static Vqueue*
+mkvqueue(int size)
+{
+	Vqueue *q;
+	uchar *p;
+	int i;
+
+	q = malloc(sizeof(*q) + sizeof(void*)*size);
+	p = mallocalign(
+		PGROUND(sizeof(Vdesc)*size + 
+			VringSize + 
+			sizeof(u16int)*size + 
+			sizeof(u16int)) +
+		PGROUND(VringSize + 
+			sizeof(Vused)*size + 
+			sizeof(u16int)), 
+		BY2PG, 0, 0);
+	if(p == nil || q == nil){
+		print("virtio: no memory for Vqueue\n");
+		free(p);
+		free(q);
+		return nil;
+	}
+
+	q->desc = (void*)p;
+	p += sizeof(Vdesc)*size;
+	q->avail = (void*)p;
+	p += VringSize;
+	q->availent = (void*)p;
+	p += sizeof(u16int)*size;
+	q->availevent = (void*)p;
+	p += sizeof(u16int);
+
+	p = (uchar*)PGROUND((uintptr)p);
+	q->used = (void*)p;
+	p += VringSize;
+	q->usedent = (void*)p;
+	p += sizeof(Vused)*size;
+	q->usedevent = (void*)p;
+
+	q->free = -1;
+	q->nfree = q->size = size;
+	for(i=0; i<size; i++){
+		q->desc[i].next = q->free;
+		q->free = i;
+	}
+
+	return q;
+}
+
+static int
+matchvirtiocfgcap(Pcidev *p, int cap, int off, int typ)
+{
+	int bar;
+
+	if(cap != 9 || pcicfgr8(p, off+3) != typ)
+		return 1;
+
+	/* skip invalid or non memory bars */
+	bar = pcicfgr8(p, off+4);
+	if(bar < 0 || bar >= nelem(p->mem) 
+	|| p->mem[bar].size == 0
+	|| (p->mem[bar].bar & 3) != 0)
+		return 1;
+
+	return 0;
+}
+
+static int
+virtiocap(Pcidev *p, int typ)
+{
+	return pcienumcaps(p, matchvirtiocfgcap, typ);
+}
+
+static void*
+virtiomapregs(Pcidev *p, int cap, int size)
+{
+	int bar, len;
+	uvlong addr;
+
+	if(cap < 0)
+		return nil;
+	bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+	addr = pcicfgr32(p, cap+8);
+	len = pcicfgr32(p, cap+12);
+	if(size <= 0)
+		size = len;
+	else if(len < size)
+		return nil;
+	if(addr+len > p->mem[bar].size)
+		return nil;
+	addr += p->mem[bar].bar & ~0xFULL;
+	return vmap(addr, size);
+}
+
+static Vdev*
+viopnpdevs(int typ)
+{
+	Vdev *vd, *h, *t;
+	Vconfig *cfg;
+	Vqueue *q;
+	Pcidev *p;
+	int cap, bar;
+	int n, i;
+
+	h = t = nil;
+	for(p = nil; p = pcimatch(p, 0x1AF4, 0x1040+typ);){
+		if(p->rid == 0)
+			continue;
+		if((cap = virtiocap(p, 1)) < 0)
+			continue;
+		bar = pcicfgr8(p, cap+4) % nelem(p->mem);
+		cfg = virtiomapregs(p, cap, sizeof(Vconfig));
+		if(cfg == nil)
+			continue;
+		if((vd = malloc(sizeof(*vd))) == nil){
+			print("virtio: no memory for Vdev\n");
+			break;
+		}
+		vd->port = p->mem[bar].bar & ~0xFULL;
+		vd->typ = typ;
+		vd->pci = p;
+		vd->cfg = cfg;
+		pcienable(p);
+
+		vd->isr = virtiomapregs(p, virtiocap(p, 3), 0);
+		if(vd->isr == nil){
+Baddev:
+			pcidisable(p);
+			/* TODO: vunmap */
+			free(vd);
+			continue;
+		}
+		cap = virtiocap(p, 2);
+		vd->notify = virtiomapregs(p, cap, 0);
+		if(vd->notify == nil)
+			goto Baddev;
+		vd->notifyoffmult = pcicfgr32(p, cap+16);
+
+		/* reset */
+		cfg->status = 0;
+		while(cfg->status != 0)
+			delay(1);
+		cfg->status = Acknowledge|Driver;
+
+		/* negotiate feature bits */
+		cfg->devfeatsel = 1;
+		vd->feat[1] = cfg->devfeat;
+		cfg->devfeatsel = 0;
+		vd->feat[0] = cfg->devfeat;
+		cfg->drvfeatsel = 1;
+		cfg->drvfeat = vd->feat[1] & 1;
+		cfg->drvfeatsel = 0;
+		cfg->drvfeat = 0;
+
+		for(i=0; i<nelem(vd->queue); i++){
+			cfg->queuesel = i;
+			n = cfg->queuesize;
+			if(n == 0 || (n & (n-1)) != 0)
+				break;
+			if((q = mkvqueue(n)) == nil)
+				break;
+			q->notify = vd->notify + vd->notifyoffmult * cfg->queuenotifyoff;
+			q->dev = vd;
+			q->idx = i;
+			vd->queue[i] = q;
+			coherence();
+			cfg->queuedesc = PADDR(q->desc);
+			cfg->queueavail = PADDR(q->avail);
+			cfg->queueused = PADDR(q->used);
+		}
+		vd->nqueue = i;
+	
+		if(h == nil)
+			h = vd;
+		else
+			t->next = vd;
+		t = vd;
+	}
+
+	return h;
+}
+
+struct Rock {
+	int done;
+	Rendez *sleep;
+};
+
+static void
+vqinterrupt(Vqueue *q)
+{
+	int id, free, m;
+	struct Rock *r;
+	Rendez *z;
+
+	m = q->size-1;
+
+	ilock(q);
+	while((q->lastused ^ q->used->idx) & m){
+		id = q->usedent[q->lastused++ & m].id;
+		if(r = q->rock[id]){
+			q->rock[id] = nil;
+			z = r->sleep;
+			r->done = 1;	/* hands off */
+			if(z != nil)
+				wakeup(z);
+		}
+		do {
+			free = id;
+			id = q->desc[free].next;
+			q->desc[free].next = q->free;
+			q->free = free;
+			q->nfree++;
+		} while(q->desc[free].flags & Next);
+	}
+	iunlock(q);
+}
+
+static void
+viointerrupt(Ureg *, void *arg)
+{
+	Vdev *vd = arg;
+
+	if(vd->isr[0] & 1)
+		vqinterrupt(vd->queue[vd->typ == TypSCSI ? 2 : 0]);
+}
+
+static int
+viodone(void *arg)
+{
+	return ((struct Rock*)arg)->done;
+}
+
+static void
+vqio(Vqueue *q, int head)
+{
+	struct Rock rock;
+
+	rock.done = 0;
+	rock.sleep = &up->sleep;
+	q->rock[head] = &rock;
+	q->availent[q->avail->idx & (q->size-1)] = head;
+	coherence();
+	q->avail->idx++;
+	iunlock(q);
+	if((q->used->flags & 1) == 0)
+		*((u16int*)q->notify) = q->idx;
+	while(!rock.done){
+		while(waserror())
+			;
+		tsleep(rock.sleep, viodone, &rock, 1000);
+		poperror();
+
+		if(!rock.done)
+			vqinterrupt(q);
+	}
+}
+
+static int
+vioblkreq(Vdev *vd, int typ, void *a, long count, long secsize, uvlong lba)
+{
+	int need, free, head;
+	Vqueue *q;
+	Vdesc *d;
+
+	u8int status;
+	struct Vioblkreqhdr {
+		u32int	typ;
+		u32int	prio;
+		u64int	lba;
+	} req;
+
+	need = 2;
+	if(a != nil)
+		need = 3;
+
+	status = -1;
+	req.typ = typ;
+	req.prio = 0;
+	req.lba = lba;
+
+	q = vd->queue[0];
+	ilock(q);
+	while(q->nfree < need){
+		iunlock(q);
+
+		if(!waserror())
+			tsleep(&up->sleep, return0, 0, 500);
+		poperror();
+
+		ilock(q);
+	}
+
+	head = free = q->free;
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&req);
+	d->len = sizeof(req);
+	d->flags = Next;
+
+	if(a != nil){
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(a);
+		d->len = secsize*count;
+		d->flags = typ ? Next : (Write|Next);
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(&status);
+	d->len = sizeof(status);
+	d->flags = Write;
+
+	q->free = free;
+	q->nfree -= need;
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	return status;
+}
+
+static int
+vioscsireq(SDreq *r)
+{
+	u8int resp[4+4+2+2+SENSESIZE];
+	u8int req[8+8+3+CDBSIZE];
+	int free, head;
+	u32int len;
+	Vqueue *q;
+	Vdesc *d;
+	Vdev *vd;
+	SDunit *u;
+	Vscsidev *scsi;
+
+	u = r->unit;
+	vd = u->dev->ctlr;
+	scsi = vd->dev;
+
+	memset(resp, 0, sizeof(resp));
+	memset(req, 0, sizeof(req));
+	req[0] = 1;
+	req[1] = u->subno;
+	req[2] = r->lun>>8;
+	req[3] = r->lun&0xFF;
+	*(u64int*)(&req[8]) = (uintptr)r;
+
+	memmove(&req[8+8+3], r->cmd, r->clen);
+
+	q = vd->queue[2];
+	ilock(q);
+	while(q->nfree < 3){
+		iunlock(q);
+
+		if(!waserror())
+			tsleep(&up->sleep, return0, 0, 500);
+		poperror();
+
+		ilock(q);
+	}
+
+	head = free = q->free;
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(req);
+	d->len = 8+8+3+scsi->cdb_size;
+	d->flags = Next;
+
+	if(r->write && r->dlen > 0){
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Next;
+	}
+
+	d = &q->desc[free]; free = d->next;
+	d->addr = PADDR(resp);
+	d->len = 4+4+2+2+scsi->sense_size;
+	d->flags = Write;
+
+	if(!r->write && r->dlen > 0){
+		d->flags |= Next;
+
+		d = &q->desc[free]; free = d->next;
+		d->addr = PADDR(r->data);
+		d->len = r->dlen;
+		d->flags = Write;
+	}
+	
+	q->free = free;
+	q->nfree -= 2 + (r->dlen > 0);
+
+	/* queue io, unlock and wait for completion */
+	vqio(q, head);
+
+	/* response+status */
+	r->status = resp[10];
+	if(resp[11] != 0)
+		r->status = SDcheck;
+
+	/* sense_len */
+	len = *((u32int*)&resp[0]);
+	if(len > 0){
+		if(len > sizeof(r->sense))
+			len = sizeof(r->sense);
+		memmove(r->sense, &resp[4+4+2+2], len);
+		r->flags |= SDvalidsense;
+	}
+
+	/* data residue */
+	len = *((u32int*)&resp[4]);
+	if(len > r->dlen)
+		r->rlen = 0;
+	else
+		r->rlen = r->dlen - len;
+
+	return r->status;
+
+}
+
+static long
+viobio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
+{
+	long ss, cc, max, ret;
+	Vdev *vd;
+
+	vd = u->dev->ctlr;
+	if(vd->typ == TypSCSI)
+		return scsibio(u, lun, write, a, count, lba);
+
+	max = 32;
+	ss = u->secsize;
+	ret = 0;
+	while(count > 0){
+		if((cc = count) > max)
+			cc = max;
+		if(vioblkreq(vd, write != 0, (uchar*)a + ret, cc, ss, lba) != 0)
+			error(Eio);
+		ret += cc*ss;
+		count -= cc;
+		lba += cc;
+	}
+	return ret;
+}
+
+static int
+viorio(SDreq *r)
+{
+	int i, count, rw;
+	uvlong lba;
+	SDunit *u;
+	Vdev *vd;
+
+	u = r->unit;
+	vd = u->dev->ctlr;
+	if(vd->typ == TypSCSI)
+		return vioscsireq(r);
+	if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){
+		if(vioblkreq(vd, 4, nil, 0, 0, 0) != 0)
+			return sdsetsense(r, SDcheck, 3, 0xc, 2);
+		return sdsetsense(r, SDok, 0, 0, 0);
+	}
+	if((i = sdfakescsi(r)) != SDnostatus)
+		return r->status = i;
+	if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
+		return i;
+	r->rlen = viobio(u, r->lun, rw == SDwrite, r->data, count, lba);
+	return r->status = SDok;
+}
+
+static int
+vioonline(SDunit *u)
+{
+	Vdev *vd;
+	Vblkdev *blk;
+	uvlong cap;
+
+	vd = u->dev->ctlr;
+	if(vd->typ == TypSCSI)
+		return scsionline(u);
+
+	blk = vd->dev;
+	cap = blk->capacity;
+	if(u->sectors != cap){
+		u->sectors = cap;
+		u->secsize = 512;
+		return 2;
+	}
+	return 1;
+}
+
+static int
+vioverify(SDunit *u)
+{
+	Vdev *vd;
+
+	vd = u->dev->ctlr;
+	if(vd->typ == TypSCSI)
+		return scsiverify(u);
+
+	return 1;
+}
+
+SDifc sdvirtio10ifc;
+
+static int
+vioenable(SDev *sd)
+{
+	char name[32];
+	Vdev *vd;
+	int i;
+
+	vd = sd->ctlr;
+	pcisetbme(vd->pci);
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrenable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+	coherence();
+
+	vd->cfg->status |= DriverOk;
+	for(i = 0; i < vd->nqueue; i++){
+		vd->cfg->queuesel = i;
+		vd->cfg->queueenable = 1;
+	}
+
+	return 1;
+}
+
+static int
+viodisable(SDev *sd)
+{
+	char name[32];
+	Vdev *vd;
+
+	vd = sd->ctlr;
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrdisable(vd->pci->intl, viointerrupt, vd, vd->pci->tbdf, name);
+	pciclrbme(vd->pci);
+	return 1;
+}
+
+static SDev*
+viopnp(void)
+{
+	SDev *s, *h, *t;
+	Vdev *vd;
+	int id;
+
+	h = t = nil;
+
+	id = 'F';
+	for(vd =  viopnpdevs(TypBlk); vd; vd = vd->next){
+		if(vd->nqueue == 0)
+			continue;
+
+		if((vd->dev = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vblkdev))) == nil)
+			break;
+		if((s = malloc(sizeof(*s))) == nil)
+			break;
+		s->ctlr = vd;
+		s->idno = id++;
+		s->ifc = &sdvirtio10ifc;
+		s->nunit = 1;
+		if(h)
+			t->next = s;
+		else
+			h = s;
+		t = s;
+	}
+
+	id = '0';
+	for(vd = viopnpdevs(TypSCSI); vd; vd = vd->next){
+		Vscsidev *scsi;
+
+		if(vd->nqueue < 3)
+			continue;
+
+		if((scsi = virtiomapregs(vd->pci, virtiocap(vd->pci, 4), sizeof(Vscsidev))) == nil)
+			break;
+		if(scsi->max_target == 0){
+			vunmap(scsi, sizeof(Vscsidev));
+			continue;
+		}
+		if((scsi->cdb_size > CDBSIZE) || (scsi->sense_size > SENSESIZE)){
+			print("sdvirtio: cdb %ud or sense size %ud too big\n",
+				scsi->cdb_size, scsi->sense_size);
+			vunmap(scsi, sizeof(Vscsidev));
+			continue;
+		}
+		vd->dev = scsi;
+
+		if((s = malloc(sizeof(*s))) == nil)
+			break;
+		s->ctlr = vd;
+		s->idno = id++;
+		s->ifc = &sdvirtio10ifc;
+		s->nunit = scsi->max_target;
+
+		if(h)
+			t->next = s;
+		else
+			h = s;
+		t = s;
+	}
+	return h;
+}
+
+SDifc sdvirtio10ifc = {
+	"virtio10",			/* name */
+
+	viopnp,				/* pnp */
+	nil,				/* legacy */
+	vioenable,			/* enable */
+	viodisable,			/* disable */
+
+	vioverify,			/* verify */
+	vioonline,			/* online */
+	viorio,				/* rio */
+	nil,				/* rctl */
+	nil,				/* wctl */
+
+	viobio,				/* bio */
+	nil,				/* probe */
+	nil,				/* clear */
+	nil,				/* rtopctl */
+	nil,				/* wtopctl */
+};
--- a/sys/src/9/pc64/pc64
+++ b/sys/src/9/pc64/pc64
@@ -78,6 +78,7 @@
 	etherwpi	pci wifi
 	etherrt2860	pci wifi
 	ethervirtio	pci
+	ethervirtio10	pci
 	ethermedium
 #	pcmciamodem
 	netdevmedium
@@ -105,6 +106,7 @@
 	sdiahci		pci sdscsi led
 #	sdodin		pci sdscsi led
 	sdvirtio	pci sdscsi
+	sdvirtio10	pci sdscsi
 	sdmmc		pci pmmc
 	sdnvme		pci
 	sdloop