ref: bfae9e08be692b944ab3018d98693a15ca38a64c
parent: 83dd98022de08fd6b7f9f041814ea40fa566a1b3
author: cinap_lenrek <[email protected]>
date: Tue Mar 28 20:21:35 EDT 2017
sdnvme: NVMe controller driver (work in progress) basic NVMe controller driver, reads and writes work. "namespaces" show up as logical units. uses pin/msi interrupts (no msi-x support yet). one submission queue per cpu, shared completion queue. no recovery from fatal controller errors. only tested in qemu (no hardware available). committing this so it can be found by someone who has hardware.
--- /dev/null
+++ b/sys/src/9/pc/sdnvme.c
@@ -1,0 +1,663 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "ureg.h"
+#include "../port/error.h"
+
+#include "../port/sd.h"
+
+typedef struct WS WS;	/* wait state of one submitted command */
+typedef struct CQ CQ;	/* completion queue */
+typedef struct SQ SQ;	/* submission queue */
+typedef struct Ctlr Ctlr;	/* per-controller driver state */
+
+struct WS	/* wait state of one submitted command; set up by qcmd(), completed by nvmeintr() */
+{
+ u32int cdw0;	/* dword 0 of the completion entry, stored by nvmeintr() */
+ ushort status;	/* dword 3 of the completion entry shifted right by 17; 0 is treated as success */
+ Rendez *sleep;	/* rendezvous the submitting process sleeps on (&up->sleep) */
+ WS **link;	/* slot in SQ.wait[] that points at this WS while the command is pending */
+ SQ *queue;	/* submission queue the command was placed on */
+};
+
+struct CQ	/* completion queue ring */
+{
+ u32int head;	/* free-running count of consumed entries; head & mask is the ring index */
+ u32int mask;	/* (1<<shift)-1 */
+ u32int shift;	/* log2 of the number of entries (lgsize-4 in cqalloc) */
+ u32int *base;	/* ring memory, 4 u32int per entry; nil when not allocated */
+ Ctlr *ctlr;	/* owning controller */
+};
+
+struct SQ	/* submission queue ring */
+{
+ u32int tail;	/* free-running count of queued commands; tail & mask is the ring index */
+ u32int mask;	/* (1<<shift)-1 */
+ u32int shift;	/* log2 of the number of entries (lgsize-6 in sqalloc) */
+ u32int *base;	/* ring memory, 16 u32int per entry */
+ WS **wait;	/* one slot per ring entry; nil when the entry is free */
+ Ctlr *ctlr;	/* owning controller */
+};
+
+struct Ctlr	/* per-controller driver state */
+{
+ QLock;	/* taken by qcmd() for admin queue submissions, released by wcmd() */
+
+ Lock intr;	/* held by nvmeintr() and while changing ints */
+ u32int ints;	/* interrupt bits currently enabled; 0 makes nvmeintr() return at once */
+ u32int irqc[2];	/* not referenced by the code in this file */
+
+ Pcidev *pci;	/* PCI device of the controller */
+ u32int *reg;	/* mapped BAR0 registers, indexed in 32-bit words */
+
+ u64int cap;	/* Cap0 | Cap1<<32, read in nvmepnpctlrs() */
+ uchar *ident;	/* 4K buffer filled by the identify controller command */
+ u32int *nsid;	/* 4K buffer filled with the namespace id list */
+ int nnsid;	/* number of leading nonzero entries in nsid[] */
+
+ u32int mps; /* mps = 1<<mpsshift */
+ u32int mpsshift;	/* log2 of the memory page size chosen in nvmepnpctlrs() */
+ u32int dstrd;	/* cap bits 32-35; shift applied to doorbell register indexes */
+
+ CQ cq[1+1];	/* [0] admin completion queue, [1] shared i/o completion queue */
+ SQ sq[1+MAXMACH];	/* [0] admin submission queue, [1+machno] one per cpu */
+
+ Ctlr *next;	/* next controller in the list built by nvmepnpctlrs() */
+};
+
+/* controller registers, as indexes into the u32int array Ctlr.reg (byte offset/4) */
+enum {
+ Cap0,	/* capabilities, low 32 bits */
+ Cap1,	/* capabilities, high 32 bits */
+ Ver,	/* presumably the version register; not referenced in this file */
+ IntMs,	/* written to mask interrupts */
+ IntMc,	/* written to unmask interrupts */
+ CCfg,	/* controller configuration; bit 0 enables the controller */
+
+ CSts = 0x1C/4,	/* controller status; (CSts&3)==1 is treated as ready, bit 1 as fatal */
+ Nssr,	/* not referenced in this file */
+ AQAttr,	/* admin queue sizes: sq mask | cq mask<<16 */
+ ASQBase0,	/* admin submission queue physical address, low */
+ ASQBase1,	/* admin submission queue physical address, high */
+ ACQBase0,	/* admin completion queue physical address, low */
+ ACQBase1,	/* admin completion queue physical address, high */
+
+ DBell = 0x1000/4,	/* first doorbell; queue doorbell indexes are shifted by Ctlr.dstrd */
+};
+
+/*
+ * Reserve the next entry of a submission queue and fill in the
+ * parts common to all commands.
+ *
+ * adm != 0 selects the admin queue sq[0] and takes the controller
+ * qlock; adm == 0 selects the queue of the current cpu and raises
+ * the priority level with splhi().  Neither is undone here: wcmd()
+ * does the qunlock()/spllo() after ringing the doorbell, so each
+ * qcmd() has to be paired with a wcmd() on the same ws.
+ *
+ * ws is recorded in the wait slot of the entry so that nvmeintr()
+ * can find and complete it.  mptr, if not nil, is stored as the
+ * metadata pointer; data/len describe the transfer buffer.  Only
+ * two data pointers are filled in: the buffer address and, when
+ * the transfer crosses a memory page boundary, the start of the
+ * following page.
+ *
+ * Returns the 16-dword command entry.  Dwords 10-15 are cleared
+ * so that a caller setting only some of them does not submit
+ * leftovers from the command that last used the slot.
+ */
+static u32int*
+qcmd(WS *ws, Ctlr *ctlr, int adm, u32int opc, u32int nsid, void *mptr, void *data, ulong len)
+{
+	u32int cid, *e;
+	u64int pa, room;
+	SQ *sq;
+	int i;
+
+	if(!adm){
+	Retry:
+		splhi();
+		sq = &ctlr->sq[1+m->machno];
+	} else {
+		qlock(ctlr);
+		sq = &ctlr->sq[0];
+	}
+	ws->sleep = &up->sleep;
+	ws->queue = sq;
+	ws->link = &sq->wait[sq->tail & sq->mask];
+	while(*ws->link != nil){
+		/* the slot still belongs to an unfinished command */
+		sched();
+		if(!adm){
+			/* should be very rare; pick the queue again via m->machno */
+			goto Retry;
+		}
+	}
+	*ws->link = ws;
+
+	/* the free-running tail doubles as command id; nvmeintr() masks it again */
+	cid = sq->tail++;
+	e = &sq->base[(cid & sq->mask)<<4];
+	e[0] = opc | cid<<16;
+	e[1] = nsid;
+	e[2] = 0;
+	e[3] = 0;
+
+	/* metadata pointer */
+	pa = 0;
+	if(mptr != nil)
+		pa = PADDR(mptr);
+	e[4] = pa;
+	e[5] = pa>>32;
+
+	/* first data pointer: the buffer itself */
+	pa = 0;
+	if(len > 0)
+		pa = PADDR(data);
+	e[6] = pa;
+	e[7] = pa>>32;
+
+	/* second data pointer: next page, only if the buffer crosses into it */
+	if(len > 0){
+		room = ctlr->mps - (pa & (ctlr->mps-1));
+		if(len > room)
+			pa += room;
+		else
+			pa = 0;
+	}
+	e[8] = pa;
+	e[9] = pa>>32;
+
+	/* command specific dwords start out clean */
+	for(i = 10; i < 16; i++)
+		e[i] = 0;
+	return e;
+}
+
+/*
+ * Interrupt handler; wcmd() also calls it directly to poll.
+ *
+ * Does nothing while ctlr->ints is 0.  Otherwise, with the intr
+ * lock held and the enabled interrupts masked, every allocated
+ * completion queue is drained: for each new entry the waiting WS
+ * is looked up through the submission queue id (dword 2, high
+ * half) and the command id (dword 3, low bits), gets cdw0 and
+ * status filled in, is unlinked from its wait slot and woken up.
+ * The new head is then written to the queue's doorbell.
+ *
+ * An entry is new when bit 16 of its dword 3 differs from bit
+ * `shift' of the free-running head counter.
+ *
+ * Entries naming a submission queue that is out of range or has
+ * no wait table are consumed without touching any WS.
+ */
+static void
+nvmeintr(Ureg *, void *arg)
+{
+	u32int phaseshift, sqid, *e;
+	WS *ws, **wp;
+	Ctlr *ctlr;
+	SQ *sq;
+	CQ *cq;
+
+	ctlr = arg;
+	if(ctlr->ints == 0)
+		return;
+
+	ilock(&ctlr->intr);
+	ctlr->reg[IntMs] = ctlr->ints;
+	for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){
+		if(cq->base == nil)
+			continue;
+		phaseshift = 16 - cq->shift;
+		for(;; cq->head++){
+			e = &cq->base[(cq->head & cq->mask)<<2];
+			if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0)
+				break;
+
+			if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n",
+				(int)(cq - ctlr->cq), cq->head & cq->mask,
+				e[0], e[1], e[2], e[3]);
+
+			/* the queue id comes from the device: don't index blindly */
+			sqid = e[2] >> 16;
+			if(sqid >= nelem(ctlr->sq))
+				continue;
+			sq = &ctlr->sq[sqid];
+			if(sq->wait == nil)
+				continue;
+
+			wp = &sq->wait[e[3] & sq->mask];
+			if((ws = *wp) != nil && ws->link == wp){
+				/* ws may go away once unlinked; fetch the rendez first */
+				Rendez *z = ws->sleep;
+				ws->cdw0 = e[0];
+				ws->status = e[3]>>17;
+				*wp = nil;
+				wakeup(z);
+			}
+		}
+		ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = cq->head & cq->mask;
+	}
+	if((ctlr->reg[CSts] & 3) != 1)
+		iprint("nvmeintr: fatal controller error\n");
+	ctlr->reg[IntMc] = ctlr->ints;
+	iunlock(&ctlr->intr);
+}
+
+/*
+ * Sleep condition used by wcmd(): returns nonzero once the wait
+ * slot of the command no longer points at the WS passed in arg
+ * (nvmeintr() clears the slot when the command completes).
+ */
+static int
+wdone(void *arg)
+{
+	WS *w;
+
+	w = arg;
+	if(*w->link == w)
+		return 0;
+	return 1;
+}
+
+/*
+ * Submit the command prepared by qcmd() and wait for it to finish.
+ *
+ * Writes the new tail to the doorbell of the submission queue,
+ * then undoes what qcmd() did to protect the queue: qunlock() for
+ * the admin queue, spllo() for a per-cpu queue.  After that the
+ * process sleeps until nvmeintr() completes ws; whenever a sleep
+ * times out without completion the completion queues are polled
+ * by calling nvmeintr() directly.
+ *
+ * Errors raised while sleeping are caught and the wait is started
+ * over -- presumably because ws lives on the caller's stack and
+ * has to stay valid until the command is done.
+ *
+ * Returns the status stored by nvmeintr(); 0 is treated as success
+ * by the callers.
+ */
+static u32int
+wcmd(WS *ws)
+{
+	Ctlr *ctlr;
+	SQ *sq;
+
+	sq = ws->queue;
+	ctlr = sq->ctlr;
+
+	/* the command must be visible in memory before the doorbell write */
+	coherence();
+	ctlr->reg[DBell + ((sq - ctlr->sq)*2 << ctlr->dstrd)] = sq->tail & sq->mask;
+
+	if(sq <= ctlr->sq)
+		qunlock(ctlr);
+	else {
+		assert(sq == &ctlr->sq[1+m->machno]);
+		spllo();
+	}
+
+	while(waserror())
+		;
+	tsleep(ws->sleep, wdone, ws, 5);
+	for(;;){
+		if(wdone(ws))
+			break;
+		nvmeintr(nil, ctlr);
+		tsleep(ws->sleep, wdone, ws, 10);
+	}
+	poperror();
+
+	return ws->status;
+}
+
+/*
+ * Raise an error if a command finished with a nonzero status.
+ * The error string is "<info>: status <hex status>", built in
+ * up->genbuf.  Returns normally only when status is 0.
+ */
+void
+checkstatus(u32int status, char *info)
+{
+	if(status != 0){
+		snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status);
+		error(up->genbuf);
+	}
+}
+
+/*
+ * Block i/o entry point: read (write == 0) or write count sectors
+ * starting at lba from/to the buffer a.
+ *
+ * The request is split into commands that each cover at most the
+ * rest of the memory page a chunk starts in plus one more page,
+ * matching the two data pointers qcmd() fills in.  Each command is
+ * submitted on the queue of the current cpu and waited for before
+ * the next one is issued.
+ *
+ * Returns the number of bytes transferred.  Raises an error when a
+ * command completes with nonzero status, or (Eio) when not even a
+ * single sector fits into the two-page window -- without that
+ * check the sector count would be 0, the "count-1" field of the
+ * command would wrap and the loop would never make progress.
+ */
+static long
+nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
+{
+	u32int nsid, s, n, max, *e;
+	Ctlr *ctlr;
+	uchar *p;
+	WS ws;
+
+	USED(lun);
+
+	ctlr = u->dev->ctlr;
+	nsid = ctlr->nsid[u->subno];
+	s = u->secsize;
+	p = a;
+	while(count > 0){
+		/* sectors that fit between p and the end of the following page */
+		max = (2*ctlr->mps - ((uintptr)p & ctlr->mps-1)) / s;
+		if(max == 0)
+			error(Eio);
+		n = count;
+		if(n > max)
+			n = max;
+
+		e = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, nsid, nil, p, n*s);
+		e[10] = lba;
+		e[11] = lba>>32;
+		e[12] = n-1;		/* zero based sector count */
+		e[13] = (count>n)<<6;	/* sequential request */
+		e[14] = 0;
+		e[15] = 0;
+		checkstatus(wcmd(&ws), write ? "write" : "read");
+
+		p += n*s;
+		count -= n;
+		lba += n;
+	}
+	return p - (uchar*)a;
+}
+
+/*
+ * Raw (SCSI style) request entry point.
+ *
+ * Commands 0x35 and 0x91 (SCSI synchronize cache 10/16) are
+ * answered with SDok without doing anything.  Everything else is
+ * first offered to sdfakescsi(); what remains is decoded as a
+ * read or write by sdfakescsirw() and carried out by nvmebio().
+ */
+static int
+nvmerio(SDreq *r)
+{
+	int status, count, rw;
+	uvlong lba;
+	SDunit *unit;
+
+	unit = r->unit;
+	switch(r->cmd[0]){
+	case 0x35:
+	case 0x91:
+		return sdsetsense(r, SDok, 0, 0, 0);
+	}
+
+	status = sdfakescsi(r);
+	if(status != SDnostatus)
+		return r->status = status;
+
+	status = sdfakescsirw(r, &lba, &count, &rw);
+	if(status != SDnostatus)
+		return status;
+
+	r->rlen = nvmebio(unit, r->lun, rw == SDwrite, r->data, count, lba);
+	return r->status = SDok;
+}
+
+/*
+ * A unit exists if its sub number indexes one of the namespace
+ * ids collected by identify().  Returns 1 if so, 0 otherwise.
+ */
+static int
+nvmeverify(SDunit *u)
+{
+	Ctlr *ctlr;
+
+	ctlr = u->dev->ctlr;
+	if(u->subno < ctlr->nnsid)
+		return 1;
+	return 0;
+}
+
+/*
+ * Bring a unit online: issue an identify namespace command for the
+ * unit's namespace id and derive sector count, sector size and the
+ * fake inquiry data from the result.
+ *
+ * Returns 1 if the unit already has a sector count, 2 after the
+ * geometry was (re)read, 0 if the buffer could not be allocated
+ * or the command failed.
+ *
+ * All multi-byte fields are assembled from unsigned operands.  The
+ * previous "p[3]<<24" was evaluated as a signed int, so a size with
+ * bit 31 set was sign extended when or-ed into the 64 bit sector
+ * count and came out with all upper bits set.
+ */
+static int
+nvmeonline(SDunit *u)
+{
+	u32int *e, lbaf;
+	uchar *info, *p;
+	u64int nsze;
+	Ctlr *ctlr;
+	WS ws;
+	int i;
+
+	if(u->sectors != 0)
+		return 1;
+
+	ctlr = u->dev->ctlr;
+	if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
+		return 0;
+
+	e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000);
+	e[10] = 0; // identify namespace
+	if(wcmd(&ws) != 0){
+		free(info);
+		return 0;
+	}
+
+	/* bytes 0-7: namespace size in sectors, little endian */
+	nsze = 0;
+	for(i = 7; i >= 0; i--)
+		nsze = nsze<<8 | info[i];
+	u->sectors = nsze;
+
+	/*
+	 * low nibble of byte 26 selects one of the 4 byte format
+	 * descriptors starting at byte 128; bits 16-23 of the
+	 * descriptor hold log2 of the sector size.
+	 */
+	p = &info[128 + 4*(info[26]&15)];
+	lbaf = (u32int)p[0] | (u32int)p[1]<<8 | (u32int)p[2]<<16 | (u32int)p[3]<<24;
+	u->secsize = 1<<((lbaf>>16)&0xFF);
+	free(info);
+
+	/* fake inquiry data; product name taken from the controller ident */
+	memset(u->inquiry, 0, sizeof u->inquiry);
+	u->inquiry[2] = 2;
+	u->inquiry[3] = 2;
+	u->inquiry[4] = sizeof u->inquiry - 4;
+	memmove(u->inquiry+8, ctlr->ident+24, 20);
+
+	return 2;
+}
+
+/*
+ * Format the ctl file text of a unit into the buffer p of size l:
+ * model, serial and firmware strings taken from the controller
+ * ident data, followed by the geometry line.  Returns the number
+ * of bytes written, or 0 when the controller has no ident data.
+ */
+static int
+nvmerctl(SDunit *u, char *p, int l)
+{
+	char *start, *end, *ident;
+	Ctlr *ctlr;
+
+	ctlr = u->dev->ctlr;
+	if(ctlr == nil || ctlr->ident == nil)
+		return 0;
+
+	ident = (char*)ctlr->ident;
+	start = p;
+	end = p + l;
+
+	p = seprint(p, end, "model\t%.20s\n", ident+24);
+	p = seprint(p, end, "serial\t%.10s\n", ident+4);
+	p = seprint(p, end, "firm\t%.6s\n", ident+64);
+	p = seprint(p, end, "geometry %llud %lud\n", u->sectors, u->secsize);
+
+	return p - start;
+}
+
+/*
+ * Initialize cq and allocate its ring: 1<<lgsize bytes, aligned to
+ * the memory page size and zeroed.  Entries are 16 bytes, so the
+ * queue holds 1<<(lgsize-4) of them.  Returns the ring; raises
+ * Enomem if the allocation fails.
+ */
+static void*
+cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize)
+{
+	int nbytes;
+
+	nbytes = 1<<lgsize;
+
+	cq->ctlr = ctlr;
+	cq->head = 0;
+	cq->shift = lgsize-4;
+	cq->mask = (1<<cq->shift)-1;
+
+	cq->base = mallocalign(nbytes, ctlr->mps, 0, 0);
+	if(cq->base == nil)
+		error(Enomem);
+	memset(cq->base, 0, nbytes);
+
+	return cq->base;
+}
+
+/*
+ * Initialize sq and allocate its ring (1<<lgsize bytes, aligned to
+ * the memory page size, zeroed) together with the table of wait
+ * slots, one per entry.  Entries are 64 bytes, so the queue holds
+ * 1<<(lgsize-6) of them.  Returns the ring; raises Enomem if
+ * either allocation fails.
+ */
+static void*
+sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize)
+{
+	int nbytes;
+
+	nbytes = 1<<lgsize;
+
+	sq->ctlr = ctlr;
+	sq->tail = 0;
+	sq->shift = lgsize-6;
+	sq->mask = (1<<sq->shift)-1;
+
+	sq->base = mallocalign(nbytes, ctlr->mps, 0, 0);
+	if(sq->base == nil)
+		error(Enomem);
+
+	sq->wait = mallocz(sizeof(WS*)*(sq->mask+1), 1);
+	if(sq->wait == nil)
+		error(Enomem);
+
+	memset(sq->base, 0, nbytes);
+	return sq->base;
+}
+
+/*
+ * Create the i/o queues with admin commands: one completion queue
+ * (id 1) shared by all cpus and one 4K submission queue per cpu
+ * (ids 1..conf.nmach), all bound to that completion queue.  When
+ * done, the interrupt bit of the completion queue is enabled.
+ * Raises an error if an allocation or a command fails.
+ */
+static void
+setupqueues(Ctlr *ctlr)
+{
+	u32int lgsize, cqid, *e;
+	CQ *cq;
+	SQ *sq;
+	WS ws;
+	int i;
+
+	/*
+	 * Overkill: a 4K submission queue has 1<<(12-6) entries and a
+	 * completion entry is 1<<4 bytes; grow the completion queue
+	 * towards room for every cpu's queue, but keep it within one
+	 * memory page and below 1<<(16+4) bytes.
+	 */
+	for(lgsize = 12-6+4;
+	    lgsize < 16+4 && lgsize < ctlr->mpsshift && (1<<lgsize) < (conf.nmach<<(12-6+4));
+	    lgsize++)
+		;
+
+	/* CQID1: shared completion queue */
+	cq = &ctlr->cq[1];
+	cqid = cq - ctlr->cq;
+	cqalloc(ctlr, cq, lgsize);
+	e = qcmd(&ws, ctlr, 1, 0x05, ~0, nil, cq->base, 1<<lgsize);
+	e[10] = cqid | cq->mask<<16;
+	e[11] = 3; /* IEN | PC */
+	checkstatus(wcmd(&ws), "create completion queue");
+
+	/* SQID[1..nmach]: submission queue per cpu */
+	for(i = 1; i <= conf.nmach; i++){
+		sq = &ctlr->sq[i];
+		sqalloc(ctlr, sq, 12);
+		e = qcmd(&ws, ctlr, 1, 0x01, ~0, nil, sq->base, 0x1000);
+		e[10] = i | sq->mask<<16;
+		e[11] = cqid<<16 | 1; /* CQID<<16 | PC */
+		checkstatus(wcmd(&ws), "create submission queue");
+	}
+
+	/* start taking interrupts for the new completion queue */
+	ilock(&ctlr->intr);
+	ctlr->ints |= 1<<cqid;
+	ctlr->reg[IntMc] = ctlr->ints;
+	iunlock(&ctlr->intr);
+}
+
+/*
+ * Read the controller ident data and the namespace id list into
+ * ctlr->ident and ctlr->nsid (4K each, allocated on first use) and
+ * count the leading nonzero namespace ids into ctlr->nnsid.
+ * Raises an error if an allocation or a command fails.
+ */
+static void
+identify(Ctlr *ctlr)
+{
+	u32int *e;
+	WS ws;
+
+	if(ctlr->ident == nil){
+		ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0);
+		if(ctlr->ident == nil)
+			error(Enomem);
+	}
+	if(ctlr->nsid == nil){
+		ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0);
+		if(ctlr->nsid == nil)
+			error(Enomem);
+	}
+
+	e = qcmd(&ws, ctlr, 1, 0x06, ~0, nil, ctlr->ident, 0x1000);
+	e[10] = 1; // identify controller
+	checkstatus(wcmd(&ws), "identify controller");
+
+	e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000);
+	e[10] = 2; // namespace list
+	checkstatus(wcmd(&ws), "namespace list");
+
+	/* the 4K list holds up to 1024 ids and ends at the first zero */
+	for(ctlr->nnsid = 0; ctlr->nnsid < 1024; ctlr->nnsid++)
+		if(ctlr->nsid[ctlr->nnsid] == 0)
+			break;
+}
+
+/*
+ * Shut the controller down and release everything nvmeenable()
+ * set up: interrupts are masked, the controller is disabled, the
+ * interrupt handler is removed, bus mastering is turned off and
+ * all queue, ident and namespace list memory is freed.
+ * Always returns 1.
+ */
+static int
+nvmedisable(SDev *sd)
+{
+	char name[32];
+	Ctlr *ctlr;
+	SQ *sq;
+	CQ *cq;
+	int tries;
+
+	ctlr = sd->ctlr;
+
+	/* mask interrupts */
+	ilock(&ctlr->intr);
+	ctlr->ints = 0;
+	ctlr->reg[IntMs] = ~ctlr->ints;
+	iunlock(&ctlr->intr);
+
+	/* disable controller, then wait up to 10 x 100ms for status bit 0 to clear */
+	ctlr->reg[CCfg] = 0;
+	for(tries = 0; tries < 10 && (ctlr->reg[CSts] & 1) != 0; tries++)
+		tsleep(&up->sleep, return0, nil, 100);
+
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
+
+	pciclrbme(ctlr->pci);	/* dma disable */
+
+	/* queues: free the memory, then reset the descriptors to the unallocated state */
+	for(sq = ctlr->sq; sq < &ctlr->sq[nelem(ctlr->sq)]; sq++){
+		free(sq->base);
+		free(sq->wait);
+	}
+	for(cq = ctlr->cq; cq < &ctlr->cq[nelem(ctlr->cq)]; cq++)
+		free(cq->base);
+	memset(ctlr->sq, 0, sizeof(ctlr->sq));
+	memset(ctlr->cq, 0, sizeof(ctlr->cq));
+
+	free(ctlr->ident);
+	free(ctlr->nsid);
+	ctlr->ident = nil;
+	ctlr->nsid = nil;
+	ctlr->nnsid = 0;
+
+	return 1;
+}
+
+/*
+ * Start the controller: install the interrupt handler, set up the
+ * admin queues, enable dma and interrupts, switch the controller
+ * on and wait for it to become ready, then identify it and create
+ * the i/o queues.
+ *
+ * Returns 1 on success.  On any error the message is printed, the
+ * controller is torn down again with nvmedisable(), sd->nunit is
+ * set to 0 and 0 is returned.
+ */
+static int
+nvmeenable(SDev *sd)
+{
+	char name[32];
+	Ctlr *ctlr;
+	u64int pa;
+	int to;
+
+	ctlr = sd->ctlr;
+
+	snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
+	intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
+
+	if(waserror()){
+		print("%s: %s\n", name, up->errstr);
+		nvmedisable(sd);
+		sd->nunit = 0;	/* hack: prevent further probing */
+		return 0;
+	}
+
+	/* admin completion queue, one memory page */
+	pa = PADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift));
+	ctlr->reg[ACQBase0] = pa;
+	ctlr->reg[ACQBase1] = pa>>32;
+
+	/* admin submission queue, one memory page */
+	pa = PADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift));
+	ctlr->reg[ASQBase0] = pa;
+	ctlr->reg[ASQBase1] = pa>>32;
+
+	ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16;
+
+	/* dma enable */
+	pcisetbme(ctlr->pci);
+
+	/* enable interrupt */
+	ilock(&ctlr->intr);
+	ctlr->ints = 1;
+	ctlr->reg[IntMc] = ctlr->ints;
+	iunlock(&ctlr->intr);
+
+	/*
+	 * enable controller: enable bit, memory page size, and log2
+	 * of the submission (64 byte) and completion (16 byte) entry
+	 * sizes
+	 */
+	ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20;
+
+	/* cap bits 24-31 give the timeout, counted here in 500ms sleeps */
+	to = (ctlr->cap>>24) & 255;
+	for(;;){
+		tsleep(&up->sleep, return0, nil, 500);
+		if((ctlr->reg[CSts] & 3) == 1)
+			break;
+		if(--to < 0){
+			if(ctlr->reg[CSts] & 2)
+				error("fatal controller status during initialization");
+			error("controller initialization timeout");
+		}
+	}
+
+	identify(ctlr);
+	setupqueues(ctlr);
+
+	poperror();
+
+	return 1;
+}
+
+/*
+ * Scan the PCI bus for NVMe controllers (class 1, subclass 8,
+ * programming interface 2) that have a memory BAR 0, and build a
+ * list of Ctlr structures for them.
+ *
+ * For each controller the registers are mapped, the capability
+ * register is read, interrupts are masked and the controller is
+ * disabled.  Controllers that cannot be mapped or that lack bit 37
+ * of the capabilities (NVM command set) are skipped.  The memory
+ * page size and doorbell stride are derived from the capabilities.
+ *
+ * Returns the head of the list, nil if nothing was found.  The
+ * next pointer of the last element is never stored, so this
+ * presumably relies on malloc() returning zeroed memory.
+ */
+static Ctlr*
+nvmepnpctlrs(void)
+{
+	Ctlr *ctlr, *h, *t;
+	Pcidev *p;
+	int i;
+
+	h = t = nil;
+	for(p = nil; p = pcimatch(p, 0, 0);){
+		if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2)
+			continue;
+		if(p->mem[0].size == 0)
+			continue;
+		if((ctlr = malloc(sizeof(*ctlr))) == nil){
+			print("nvme: no memory for Ctlr\n");
+			break;
+		}
+		ctlr->pci = p;
+		ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size);
+		if(ctlr->reg == nil){
+			print("nvme: can't vmap bar0\n");
+			free(ctlr);
+			continue;
+		}
+		ctlr->cap = ctlr->reg[Cap0];
+		ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32;
+
+		/* mask interrupts */
+		ctlr->ints = 0;
+		ctlr->reg[IntMs] = ~ctlr->ints;
+
+		/* disable controller */
+		ctlr->reg[CCfg] = 0;
+
+		if((ctlr->cap&(1ULL<<37)) == 0){
+			print("nvme: doesn't support NVM command set: %ux\n",
+				(u32int)(ctlr->cap>>37) & 0xFF);
+			vunmap(ctlr->reg, p->mem[0].size);
+			free(ctlr);
+			continue;
+		}
+
+		/*
+		 * use 64K page size when possible: start at the minimum
+		 * page size (cap bits 48-51) and go up until 64K or the
+		 * maximum (cap bits 52-55) is reached.
+		 */
+		ctlr->dstrd = (ctlr->cap >> 32) & 15;
+		for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){
+			if(i >= 16-12)	/* 64K */
+				break;
+		}
+		ctlr->mpsshift = i+12;
+		ctlr->mps = 1 << ctlr->mpsshift;
+
+		if(h == nil)
+			h = ctlr;
+		else
+			t->next = ctlr;
+		t = ctlr;
+	}
+
+	return h;
+}
+
+SDifc sdnvmeifc;	/* declared early so nvmepnp() can point at it; initialized below */
+
+/*
+ * Probe entry point: wrap every controller found by nvmepnpctlrs()
+ * in an SDev.  Devices get the id letters 'N', 'O', ... in order
+ * and are given 1024 units each.  Returns the list of devices, nil
+ * if there is none; stops early if an SDev cannot be allocated.
+ */
+static SDev*
+nvmepnp(void)
+{
+	SDev *sdev, *head, *tail;
+	Ctlr *ctlr;
+	int id;
+
+	head = nil;
+	tail = nil;
+	id = 'N';
+	for(ctlr = nvmepnpctlrs(); ctlr != nil; ctlr = ctlr->next){
+		sdev = malloc(sizeof(*sdev));
+		if(sdev == nil)
+			break;
+		sdev->ctlr = ctlr;
+		sdev->idno = id++;
+		sdev->ifc = &sdnvmeifc;
+		sdev->nunit = 1024;
+
+		if(head == nil)
+			head = sdev;
+		else
+			tail->next = sdev;
+		tail = sdev;
+	}
+
+	return head;
+}
+
+SDifc sdnvmeifc = {	/* NVMe driver method table; nil entries are not provided by this file */
+ "nvme", /* name */
+
+ nvmepnp, /* pnp */
+ nil, /* legacy */
+ nvmeenable, /* enable */
+ nvmedisable, /* disable */
+
+ nvmeverify, /* verify */
+ nvmeonline, /* online */
+ nvmerio, /* rio */
+ nvmerctl, /* rctl */
+ nil, /* wctl */
+
+ nvmebio, /* bio */
+ nil, /* probe */
+ nil, /* clear */
+ nil, /* rtopctl */
+ nil, /* wtopctl */
+};