// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <threads.h>

#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/io-buffer.h>
#include <ddk/mmio-buffer.h>
#include <ddk/protocol/block.h>
#include <ddk/protocol/pci.h>
#include <ddk/protocol/pci-lib.h>

#include <hw/reg.h>
#include <hw/pci.h>

#include <lib/sync/completion.h>

#include <zircon/device/block.h>
#include <zircon/syscalls.h>
#include <zircon/types.h>
#include <zircon/listnode.h>

#include "nvme-hw.h"
#define TXN_FLAG_FAILED 1

typedef struct {
    block_op_t op;
    list_node_t node;
    block_impl_queue_callback completion_cb;
    void* cookie;
    uint16_t pending_utxns;
    uint8_t opcode;
    uint8_t flags;
} nvme_txn_t;

typedef struct {
    zx_paddr_t phys;    // io buffer phys base (1 page)
    void* virt;         // io buffer virt base
    zx_handle_t pmt;    // pinned memory
    nvme_txn_t* txn;    // related txn
    uint16_t id;
    uint16_t reserved0;
    uint32_t reserved1;
} nvme_utxn_t;

#define UTXN_COUNT 63
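// Why 63: a single-page submission queue holds SQMAX entries (64 with the
// standard 64-byte NVME command and 4K pages), and a queue whose tail + 1
// equals its head is treated as full, so at most SQMAX - 1 commands can be
// in flight at once.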

// There's no system constant for this.  Ensure it matches reality.
#define PAGE_SHIFT (12ULL)
static_assert(PAGE_SIZE == (1ULL << PAGE_SHIFT), "");

#define PAGE_MASK (PAGE_SIZE - 1ULL)

// Limit maximum transfer size to 1MB which fits comfortably
// within our single scatter gather page per utxn setup
#define MAX_XFER (1024*1024)

// Maximum submission and completion queue item counts, for
// queues that are a single page in size.
#define SQMAX (PAGE_SIZE / sizeof(nvme_cmd_t))
#define CQMAX (PAGE_SIZE / sizeof(nvme_cpl_t))
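// With 4K pages, 64-byte commands, and 16-byte completions this works out
// to SQMAX == 64 and CQMAX == 256 entries.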

// global driver state bits
#define FLAG_IRQ_THREAD_STARTED  0x0001
#define FLAG_IO_THREAD_STARTED   0x0002
#define FLAG_SHUTDOWN            0x0004

#define FLAG_HAS_VWC             0x0100

typedef struct {
    mmio_buffer_t mmio;
    zx_handle_t irqh;
    zx_handle_t bti;
    uint32_t flags;
    mtx_t lock;

    // io queue doorbell registers
    void* io_sq_tail_db;
    void* io_cq_head_db;

    nvme_cpl_t* io_cq;
    nvme_cmd_t* io_sq;
    uint32_t io_nsid;
    uint16_t io_cq_head;
    uint16_t io_cq_toggle;
    uint16_t io_sq_tail;
    uint16_t io_sq_head;

    uint64_t utxn_avail;   // bitmask of available utxns

    // The pending list is txns that have been received
    // via nvme_queue() and are waiting for io to start.
    // The exception is the head of the pending list which may
    // be partially started, waiting for more utxns to become
    // available.
    // The active list consists of txns where all utxns have
    // been created and we're waiting for them to complete or
    // error out.
    list_node_t pending_txns;      // inbound txns to process
    list_node_t active_txns;       // txns in flight

    // The io signal completion is signaled from nvme_queue()
    // or from the irq thread, notifying the io thread that
    // it has work to do.
    sync_completion_t io_signal;

    uint32_t max_xfer;
    block_info_t info;

    // admin queue doorbell registers
    void* io_admin_sq_tail_db;
    void* io_admin_cq_head_db;

    // admin queues and state
    nvme_cpl_t* admin_cq;
    nvme_cmd_t* admin_sq;
    uint16_t admin_cq_head;
    uint16_t admin_cq_toggle;
    uint16_t admin_sq_tail;
    uint16_t admin_sq_head;

    // context for admin transactions
    // presently we serialize these under the admin_lock
    mtx_t admin_lock;
    sync_completion_t admin_signal;
    nvme_cpl_t admin_result;

    pci_protocol_t pci;
    zx_device_t* zxdev;

    size_t iosz;

    // source of physical pages for queues and admin commands
    io_buffer_t iob;

    thrd_t irqthread;
    thrd_t iothread;

    // pool of utxns
    nvme_utxn_t utxn[UTXN_COUNT];
} nvme_device_t;

// We break IO transactions down into one or more "micro transactions" (utxn)
// based on the transfer limits of the controller, etc.  Each utxn has an
// id associated with it, which is used as the command id for the command
// queued to the NVME device.  This id is the same as its index into the
// pool of utxns and the bitmask of free txns, to simplify management.
//
// We maintain a pool of 63 of these, which is the number of commands
// that can be submitted to NVME via a single page submit queue.
//
// The utxns are not protected by locks.  Instead, after initialization,
// they may only be touched by the io thread, which is responsible for
// queueing commands and dequeuing completion messages.

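// The free pool is tracked as a bitmask: utxn_avail starts with bits 0..62
// set (see nvme_init), utxn_get() claims the lowest set bit
// (__builtin_ffsll returns a 1-based bit index, or 0 when no bits are set),
// and utxn_put() simply sets the bit for the utxn's id again.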
static nvme_utxn_t* utxn_get(nvme_device_t* nvme) {
    uint64_t n = __builtin_ffsll(nvme->utxn_avail);
    if (n == 0) {
        return NULL;
    }
    n--;
    nvme->utxn_avail &= ~(1ULL << n);
    return nvme->utxn + n;
}

static void utxn_put(nvme_device_t* nvme, nvme_utxn_t* utxn) {
    uint64_t n = utxn->id;
    nvme->utxn_avail |= (1ULL << n);
}

static zx_status_t nvme_admin_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
    if ((readw(&nvme->admin_cq[nvme->admin_cq_head].status) & 1) != nvme->admin_cq_toggle) {
        return ZX_ERR_SHOULD_WAIT;
    }
    *cpl = nvme->admin_cq[nvme->admin_cq_head];

    // advance the head pointer, wrapping and inverting toggle at max
    uint16_t next = (nvme->admin_cq_head + 1) & (CQMAX - 1);
    if ((nvme->admin_cq_head = next) == 0) {
        nvme->admin_cq_toggle ^= 1;
    }

    // note the new sq head reported by hw
    nvme->admin_sq_head = cpl->sq_head;

    // ring the doorbell
    writel(next, nvme->io_admin_cq_head_db);
    return ZX_OK;
}

static zx_status_t nvme_admin_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
    uint16_t next = (nvme->admin_sq_tail + 1) & (SQMAX - 1);

    // if tail + 1 == head: queue is full
    if (next == nvme->admin_sq_head) {
        return ZX_ERR_SHOULD_WAIT;
    }

    nvme->admin_sq[nvme->admin_sq_tail] = *cmd;
    nvme->admin_sq_tail = next;

    // ring the doorbell
    writel(next, nvme->io_admin_sq_tail_db);
    return ZX_OK;
}
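
// For example, with SQMAX == 64, head == 5 and tail == 4: the next tail
// would equal the head, so the queue already holds 63 entries and is full.
// The same full/empty convention applies to the IO submission queue below.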

static zx_status_t nvme_io_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
    if ((readw(&nvme->io_cq[nvme->io_cq_head].status) & 1) != nvme->io_cq_toggle) {
        return ZX_ERR_SHOULD_WAIT;
    }
    *cpl = nvme->io_cq[nvme->io_cq_head];

    // advance the head pointer, wrapping and inverting toggle at max
    uint16_t next = (nvme->io_cq_head + 1) & (CQMAX - 1);
    if ((nvme->io_cq_head = next) == 0) {
        nvme->io_cq_toggle ^= 1;
    }

    // note the new sq head reported by hw
    nvme->io_sq_head = cpl->sq_head;
    return ZX_OK;
}

static void nvme_io_cq_ack(nvme_device_t* nvme) {
    // ring the doorbell
    writel(nvme->io_cq_head, nvme->io_cq_head_db);
}

static zx_status_t nvme_io_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
    uint16_t next = (nvme->io_sq_tail + 1) & (SQMAX - 1);

    // if tail + 1 == head: queue is full
    if (next == nvme->io_sq_head) {
        return ZX_ERR_SHOULD_WAIT;
    }

    nvme->io_sq[nvme->io_sq_tail] = *cmd;
    nvme->io_sq_tail = next;

    // ring the doorbell
    writel(next, nvme->io_sq_tail_db);
    return ZX_OK;
}

static int irq_thread(void* arg) {
    nvme_device_t* nvme = arg;
    for (;;) {
        zx_status_t r;
        if ((r = zx_interrupt_wait(nvme->irqh, NULL)) != ZX_OK) {
            zxlogf(ERROR, "nvme: irq wait failed: %d\n", r);
            break;
        }

        nvme_cpl_t cpl;
        if (nvme_admin_cq_get(nvme, &cpl) == ZX_OK) {
            nvme->admin_result = cpl;
            sync_completion_signal(&nvme->admin_signal);
        }

        sync_completion_signal(&nvme->io_signal);
    }
    return 0;
}

static zx_status_t nvme_admin_txn(nvme_device_t* nvme, nvme_cmd_t* cmd, nvme_cpl_t* cpl) {
    zx_status_t r;
    mtx_lock(&nvme->admin_lock);
    sync_completion_reset(&nvme->admin_signal);
    if ((r = nvme_admin_sq_put(nvme, cmd)) != ZX_OK) {
        goto done;
    }
    if ((r = sync_completion_wait(&nvme->admin_signal, ZX_SEC(1))) != ZX_OK) {
        zxlogf(ERROR, "nvme: admin txn: timed out\n");
        goto done;
    }

    unsigned code = NVME_CPL_STATUS_CODE(nvme->admin_result.status);
    if (code != 0) {
        zxlogf(ERROR, "nvme: admin txn: nvm error %03x\n", code);
        r = ZX_ERR_IO;
    }
    if (cpl != NULL) {
        *cpl = nvme->admin_result;
    }
done:
    mtx_unlock(&nvme->admin_lock);
    return r;
}

static inline void txn_complete(nvme_txn_t* txn, zx_status_t status) {
    txn->completion_cb(txn->cookie, status, &txn->op);
}

// Attempt to generate utxns and queue nvme commands for a txn
// Returns true if this could not be completed due to temporary
// lack of resources or false if either it succeeded or errored out.
static bool io_process_txn(nvme_device_t* nvme, nvme_txn_t* txn) {
    zx_handle_t vmo = txn->op.rw.vmo;
    nvme_utxn_t* utxn;
    zx_paddr_t* pages;
    zx_status_t r;

    for (;;) {
        // If there are no available utxns, we can't proceed
        // and we tell the caller to retain the txn (true)
        if ((utxn = utxn_get(nvme)) == NULL) {
            return true;
        }

        uint32_t blocks = txn->op.rw.length;
        if (blocks > nvme->max_xfer) {
            blocks = nvme->max_xfer;
        }

        // Total transfer size in bytes
        size_t bytes = ((size_t) blocks) * ((size_t) nvme->info.block_size);

        // Page offset of first page of transfer
        size_t pageoffset = txn->op.rw.offset_vmo & (~PAGE_MASK);

        // Byte offset into first page of transfer
        size_t byteoffset = txn->op.rw.offset_vmo & PAGE_MASK;

        // Total pages mapped / touched
        size_t pagecount = (byteoffset + bytes + PAGE_MASK) >> PAGE_SHIFT;

        // read disk (OP_READ) -> memory (PERM_WRITE) or
        // write memory (PERM_READ) -> disk (OP_WRITE)
        uint32_t opt = (txn->opcode == NVME_OP_READ) ? ZX_BTI_PERM_WRITE : ZX_BTI_PERM_READ;

        pages = utxn->virt;

        if ((r = zx_bti_pin(nvme->bti, opt, vmo, pageoffset, pagecount << PAGE_SHIFT,
                            pages, pagecount, &utxn->pmt)) != ZX_OK) {
            zxlogf(ERROR, "nvme: could not pin pages: %d\n", r);
            break;
        }

        nvme_cmd_t cmd;
        memset(&cmd, 0, sizeof(cmd));
        cmd.cmd = NVME_CMD_CID(utxn->id) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(txn->opcode);
        cmd.nsid = 1;
        cmd.u.rw.start_lba = txn->op.rw.offset_dev;
        cmd.u.rw.block_count = blocks - 1;
        // The NVME command has room for two data pointers inline.
        // The first is always the pointer to the first page where data is.
        // The second is the second page if pagecount is 2.
        // The second is the address of an array of page 2..n if pagecount > 2
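        // For example, a 12KB read starting 512 bytes into a page touches
        // four pages: prp[0] points 512 bytes into the first pinned page,
        // and prp[1] points at the PRP list (the pinned page array starting
        // at its second entry) which holds pages 1..3.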
        cmd.dptr.prp[0] = pages[0] | byteoffset;
        if (pagecount == 2) {
            cmd.dptr.prp[1] = pages[1];
        } else if (pagecount > 2) {
            cmd.dptr.prp[1] = utxn->phys + sizeof(uint64_t);
        }

        zxlogf(TRACE, "nvme: txn=%p utxn id=%u pages=%zu op=%s\n", txn, utxn->id, pagecount,
               txn->opcode == NVME_OP_WRITE ? "WR" : "RD");
        zxlogf(SPEW, "nvme: prp[0]=%016zx prp[1]=%016zx\n", cmd.dptr.prp[0], cmd.dptr.prp[1]);
        zxlogf(SPEW, "nvme: pages[] = { %016zx, %016zx, %016zx, %016zx, ... }\n",
               pages[0], pages[1], pages[2], pages[3]);

        if ((r = nvme_io_sq_put(nvme, &cmd)) != ZX_OK) {
            zxlogf(ERROR, "nvme: could not submit cmd (txn=%p id=%u)\n", txn, utxn->id);
            break;
        }

        utxn->txn = txn;

        // keep track of where we are
        txn->op.rw.offset_dev += blocks;
        txn->op.rw.offset_vmo += bytes;
        txn->op.rw.length -= blocks;
        txn->pending_utxns++;

        // If there's no more remaining, we're done, and we
        // move this txn to the active list and tell the
        // caller not to retain the txn (false)
        if (txn->op.rw.length == 0) {
            mtx_lock(&nvme->lock);
            list_add_tail(&nvme->active_txns, &txn->node);
            mtx_unlock(&nvme->lock);
            return false;
        }
    }

    // failure
    if ((r = zx_pmt_unpin(utxn->pmt)) != ZX_OK) {
        zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
    }
    utxn_put(nvme, utxn);

    mtx_lock(&nvme->lock);
    txn->flags |= TXN_FLAG_FAILED;
    if (txn->pending_utxns) {
        // if there are earlier uncompleted IOs we become active now
        // and will finish erroring out when they complete
        list_add_tail(&nvme->active_txns, &txn->node);
        txn = NULL;
    }
    mtx_unlock(&nvme->lock);

    if (txn != NULL) {
        txn_complete(txn, ZX_ERR_INTERNAL);
    }

    // Either way we tell the caller not to retain the txn (false)
    return false;
}

static void io_process_txns(nvme_device_t* nvme) {
    nvme_txn_t* txn;

    for (;;) {
        mtx_lock(&nvme->lock);
        txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node);
        mtx_unlock(&nvme->lock);

        if (txn == NULL) {
            return;
        }

        if (io_process_txn(nvme, txn)) {
            // put txn back at front of queue for further processing later
            mtx_lock(&nvme->lock);
            list_add_head(&nvme->pending_txns, &txn->node);
            mtx_unlock(&nvme->lock);
            return;
        }
    }
}

static void io_process_cpls(nvme_device_t* nvme) {
    bool ring_doorbell = false;
    nvme_cpl_t cpl;

    while (nvme_io_cq_get(nvme, &cpl) == ZX_OK) {
        ring_doorbell = true;

        if (cpl.cmd_id >= UTXN_COUNT) {
            zxlogf(ERROR, "nvme: unexpected cmd id %u\n", cpl.cmd_id);
            continue;
        }
        nvme_utxn_t* utxn = nvme->utxn + cpl.cmd_id;
        nvme_txn_t* txn = utxn->txn;

        if (txn == NULL) {
            zxlogf(ERROR, "nvme: inactive utxn #%u completed?!\n", cpl.cmd_id);
            continue;
        }

        uint32_t code = NVME_CPL_STATUS_CODE(cpl.status);
        if (code != 0) {
            zxlogf(ERROR, "nvme: utxn #%u txn %p failed: status=%03x\n",
                   cpl.cmd_id, txn, code);
            txn->flags |= TXN_FLAG_FAILED;
            // discard any remaining bytes -- no reason to keep creating
            // further utxns once one has failed
            txn->op.rw.length = 0;
        } else {
            zxlogf(SPEW, "nvme: utxn #%u txn %p OKAY\n", cpl.cmd_id, txn);
        }

        zx_status_t r;
        if ((r = zx_pmt_unpin(utxn->pmt)) != ZX_OK) {
            zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
        }

        // release the microtransaction
        utxn->txn = NULL;
        utxn_put(nvme, utxn);

        txn->pending_utxns--;
        if ((txn->pending_utxns == 0) && (txn->op.rw.length == 0)) {
            // remove from either pending or active list
            mtx_lock(&nvme->lock);
            list_delete(&txn->node);
            mtx_unlock(&nvme->lock);
            zxlogf(TRACE, "nvme: txn %p %s\n", txn, txn->flags & TXN_FLAG_FAILED ? "error" : "okay");
            txn_complete(txn, txn->flags & TXN_FLAG_FAILED ? ZX_ERR_IO : ZX_OK);
        }
    }

    if (ring_doorbell) {
        nvme_io_cq_ack(nvme);
    }
}

static int io_thread(void* arg) {
    nvme_device_t* nvme = arg;
    for (;;) {
        if (sync_completion_wait(&nvme->io_signal, ZX_TIME_INFINITE)) {
            break;
        }
        if (nvme->flags & FLAG_SHUTDOWN) {
            //TODO: cancel out pending IO
            zxlogf(INFO, "nvme: io thread exiting\n");
            break;
        }

        sync_completion_reset(&nvme->io_signal);

        // process completion messages
        io_process_cpls(nvme);

        // process work queue
        io_process_txns(nvme);
    }
    return 0;
}

static void nvme_queue(void* ctx, block_op_t* op, block_impl_queue_callback completion_cb,
                       void* cookie) {
    nvme_device_t* nvme = ctx;
    nvme_txn_t* txn = containerof(op, nvme_txn_t, op);
    txn->completion_cb = completion_cb;
    txn->cookie = cookie;

    switch (txn->op.command & BLOCK_OP_MASK) {
    case BLOCK_OP_READ:
        txn->opcode = NVME_OP_READ;
        break;
    case BLOCK_OP_WRITE:
        txn->opcode = NVME_OP_WRITE;
        break;
    case BLOCK_OP_FLUSH:
        // TODO
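        // Flush is currently acknowledged without touching the device;
        // presumably an NVMe Flush command would be issued here when the
        // volatile write cache is present (FLAG_HAS_VWC).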
        txn_complete(txn, ZX_OK);
        return;
    default:
        txn_complete(txn, ZX_ERR_NOT_SUPPORTED);
        return;
    }

    if (txn->op.rw.length == 0) {
        txn_complete(txn, ZX_ERR_INVALID_ARGS);
        return;
    }
    // Transaction must fit within device
    if ((txn->op.rw.offset_dev >= nvme->info.block_count) ||
        (nvme->info.block_count - txn->op.rw.offset_dev < txn->op.rw.length)) {
        txn_complete(txn, ZX_ERR_OUT_OF_RANGE);
        return;
    }

    // convert vmo offset to a byte offset
    txn->op.rw.offset_vmo *= nvme->info.block_size;

    txn->pending_utxns = 0;
    txn->flags = 0;

    zxlogf(SPEW, "nvme: io: %s: %ublks @ blk#%zu\n",
           txn->opcode == NVME_OP_WRITE ? "wr" : "rd",
           txn->op.rw.length + 1U, txn->op.rw.offset_dev);

    mtx_lock(&nvme->lock);
    list_add_tail(&nvme->pending_txns, &txn->node);
    mtx_unlock(&nvme->lock);

    sync_completion_signal(&nvme->io_signal);
}

static void nvme_query(void* ctx, block_info_t* info_out, size_t* block_op_size_out) {
    nvme_device_t* nvme = ctx;
    *info_out = nvme->info;
    *block_op_size_out = sizeof(nvme_txn_t);
}

static zx_status_t nvme_ioctl(void* ctx, uint32_t op, const void* cmd, size_t cmdlen, void* reply,
                              size_t max, size_t* out_actual) {
    nvme_device_t* nvme = ctx;
    switch (op) {
    case IOCTL_BLOCK_GET_INFO: {
        if (max < sizeof(block_info_t)) {
            return ZX_ERR_BUFFER_TOO_SMALL;
        }
        size_t sz;
        nvme_query(nvme, reply, &sz);
        *out_actual = sizeof(block_info_t);
        return ZX_OK;
    }
    default:
        return ZX_ERR_NOT_SUPPORTED;
    }
}

static zx_off_t nvme_get_size(void* ctx) {
    nvme_device_t* nvme = ctx;
    return nvme->info.block_count * nvme->info.block_size;
}

static zx_status_t nvme_suspend(void* ctx, uint32_t flags) {
    return ZX_OK;
}

static zx_status_t nvme_resume(void* ctx, uint32_t flags) {
    return ZX_OK;
}

static void nvme_release(void* ctx) {
    nvme_device_t* nvme = ctx;
    int r;

    zxlogf(INFO, "nvme: release\n");
    nvme->flags |= FLAG_SHUTDOWN;
    if (nvme->mmio.vmo != ZX_HANDLE_INVALID) {
        pci_enable_bus_master(&nvme->pci, false);
        zx_handle_close(nvme->bti);
        mmio_buffer_release(&nvme->mmio);
        // TODO: risks a handle use-after-close, will be resolved by IRQ api
        // changes coming soon
        zx_handle_close(nvme->irqh);
    }
    if (nvme->flags & FLAG_IRQ_THREAD_STARTED) {
        thrd_join(nvme->irqthread, &r);
    }
    if (nvme->flags & FLAG_IO_THREAD_STARTED) {
        sync_completion_signal(&nvme->io_signal);
        thrd_join(nvme->iothread, &r);
    }

    // error out any pending txns
    mtx_lock(&nvme->lock);
    nvme_txn_t* txn;
    while ((txn = list_remove_head_type(&nvme->active_txns, nvme_txn_t, node)) != NULL) {
        txn_complete(txn, ZX_ERR_PEER_CLOSED);
    }
    while ((txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node)) != NULL) {
        txn_complete(txn, ZX_ERR_PEER_CLOSED);
    }
    mtx_unlock(&nvme->lock);

    io_buffer_release(&nvme->iob);
    free(nvme);
}

static zx_protocol_device_t device_ops = {
    .version = DEVICE_OPS_VERSION,

    .ioctl = nvme_ioctl,
    .get_size = nvme_get_size,

    .suspend = nvme_suspend,
    .resume = nvme_resume,
    .release = nvme_release,
};

static void infostring(const char* prefix, uint8_t* str, size_t len) {
    char tmp[len + 1];
    size_t i;
    for (i = 0; i < len; i++) {
        uint8_t c = str[i];
        if (c == 0) {
            break;
        }
        if ((c < ' ') || (c > 127)) {
            c = ' ';
        }
        tmp[i] = c;
    }
    tmp[i] = 0;
    while (i > 0) {
        i--;
        if (tmp[i] == ' ') {
            tmp[i] = 0;
        } else {
            break;
        }
    }
    zxlogf(INFO, "nvme: %s'%s'\n", prefix, tmp);
}

// Convenience accessors for BAR0 registers
#define rd32(r) readl(nvme->mmio.vaddr + NVME_REG_##r)
#define rd64(r) readll(nvme->mmio.vaddr + NVME_REG_##r)
#define wr32(v,r) writel(v, nvme->mmio.vaddr + NVME_REG_##r)
#define wr64(v,r) writell(v, nvme->mmio.vaddr + NVME_REG_##r)

// dedicated pages from the page pool
#define IDX_ADMIN_SQ   0
#define IDX_ADMIN_CQ   1
#define IDX_IO_SQ      2
#define IDX_IO_CQ      3
#define IDX_SCRATCH    4
#define IDX_UTXN_POOL  5 // this must always be last

#define IO_PAGE_COUNT  (IDX_UTXN_POOL + UTXN_COUNT)
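// With UTXN_COUNT == 63 this works out to a 68 page io buffer (272KB with
// 4K pages): four queue pages, one scratch page, and one scatter-gather
// page per utxn.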

static inline uint64_t U64(uint8_t* x) {
    return *((uint64_t*) (void*) x);
}
static inline uint32_t U32(uint8_t* x) {
    return *((uint32_t*) (void*) x);
}
static inline uint16_t U16(uint8_t* x) {
    return *((uint16_t*) (void*) x);
}

#define WAIT_MS 5000

static zx_status_t nvme_init(nvme_device_t* nvme) {
    uint32_t n = rd32(VS);
    uint64_t cap = rd64(CAP);

    zxlogf(INFO, "nvme: version %d.%d.%d\n", n >> 16, (n >> 8) & 0xFF, n & 0xFF);
    zxlogf(INFO, "nvme: page size: (MPSMIN): %u (MPSMAX): %u\n",
           (unsigned) (1 << NVME_CAP_MPSMIN(cap)),
           (unsigned) (1 << NVME_CAP_MPSMAX(cap)));
    zxlogf(INFO, "nvme: doorbell stride: %u\n", (unsigned) (1 << NVME_CAP_DSTRD(cap)));
    zxlogf(INFO, "nvme: timeout: %u ms\n", (unsigned) (1 << NVME_CAP_TO(cap)));
    zxlogf(INFO, "nvme: boot partition support (BPS): %c\n", NVME_CAP_BPS(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: supports NVM command set (CSS:NVM): %c\n", NVME_CAP_CSS_NVM(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: subsystem reset supported (NSSRS): %c\n", NVME_CAP_NSSRS(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: weighted-round-robin (AMS:WRR): %c\n", NVME_CAP_AMS_WRR(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: vendor-specific arbitration (AMS:VS): %c\n", NVME_CAP_AMS_VS(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: contiguous queues required (CQR): %c\n", NVME_CAP_CQR(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: maximum queue entries supported (MQES): %u\n", ((unsigned) NVME_CAP_MQES(cap)) + 1);

    if ((1 << NVME_CAP_MPSMIN(cap)) > PAGE_SIZE) {
        zxlogf(ERROR, "nvme: minimum page size larger than platform page size\n");
        return ZX_ERR_NOT_SUPPORTED;
    }
    // allocate pages for various queues and the utxn scatter lists
    // TODO: these should all be RO to hardware apart from the scratch io page(s)
    if (io_buffer_init(&nvme->iob, nvme->bti, PAGE_SIZE * IO_PAGE_COUNT, IO_BUFFER_RW) ||
        io_buffer_physmap(&nvme->iob)) {
        zxlogf(ERROR, "nvme: could not allocate io buffers\n");
        return ZX_ERR_NO_MEMORY;
    }

    // initialize the microtransaction pool
    nvme->utxn_avail = 0x7FFFFFFFFFFFFFFFULL;
    for (unsigned n = 0; n < UTXN_COUNT; n++) {
        nvme->utxn[n].id = n;
        nvme->utxn[n].phys = nvme->iob.phys_list[IDX_UTXN_POOL + n];
        nvme->utxn[n].virt = nvme->iob.virt + (IDX_UTXN_POOL + n) * PAGE_SIZE;
    }

    if (rd32(CSTS) & NVME_CSTS_RDY) {
        zxlogf(INFO, "nvme: controller is active. resetting...\n");
        wr32(rd32(CC) & ~NVME_CC_EN, CC); // disable
    }

    // ensure previous shutdown (by us or bootloader) has completed
    unsigned ms_remain = WAIT_MS;
    while (rd32(CSTS) & NVME_CSTS_RDY) {
        if (--ms_remain == 0) {
            zxlogf(ERROR, "nvme: timed out waiting for CSTS ~RDY\n");
            return ZX_ERR_INTERNAL;
        }
        zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
    }

    zxlogf(INFO, "nvme: controller inactive. (after %u ms)\n", WAIT_MS - ms_remain);

    // configure admin submission and completion queues
    wr64(nvme->iob.phys_list[IDX_ADMIN_SQ], ASQ);
    wr64(nvme->iob.phys_list[IDX_ADMIN_CQ], ACQ);
    wr32(NVME_AQA_ASQS(SQMAX - 1) | NVME_AQA_ACQS(CQMAX - 1), AQA);

    zxlogf(INFO, "nvme: enabling\n");
    wr32(NVME_CC_EN | NVME_CC_AMS_RR | NVME_CC_MPS(0) |
         NVME_CC_IOCQES(NVME_CPL_SHIFT) |
         NVME_CC_IOSQES(NVME_CMD_SHIFT), CC);

    ms_remain = WAIT_MS;
    while (!(rd32(CSTS) & NVME_CSTS_RDY)) {
        if (--ms_remain == 0) {
            zxlogf(ERROR, "nvme: timed out waiting for CSTS RDY\n");
            return ZX_ERR_INTERNAL;
        }
        zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
    }
    zxlogf(INFO, "nvme: controller ready. (after %u ms)\n", WAIT_MS - ms_remain);

    // registers and buffers for admin queues
    nvme->io_admin_sq_tail_db = nvme->mmio.vaddr + NVME_REG_SQnTDBL(0, cap);
    nvme->io_admin_cq_head_db = nvme->mmio.vaddr + NVME_REG_CQnHDBL(0, cap);
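    // Per the NVMe spec, doorbell registers live at BAR0 offset 0x1000
    // onward: SQ y's tail doorbell sits at 0x1000 + (2y) * (4 << CAP.DSTRD)
    // and CQ y's head doorbell at 0x1000 + (2y + 1) * (4 << CAP.DSTRD),
    // which is presumably what NVME_REG_SQnTDBL/CQnHDBL compute from cap.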

    nvme->admin_sq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_SQ;
    nvme->admin_sq_head = 0;
    nvme->admin_sq_tail = 0;

    nvme->admin_cq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_CQ;
    nvme->admin_cq_head = 0;
    nvme->admin_cq_toggle = 1;

    // registers and buffers for IO queues
    nvme->io_sq_tail_db = nvme->mmio.vaddr + NVME_REG_SQnTDBL(1, cap);
    nvme->io_cq_head_db = nvme->mmio.vaddr + NVME_REG_CQnHDBL(1, cap);

    nvme->io_sq = nvme->iob.virt + PAGE_SIZE * IDX_IO_SQ;
    nvme->io_sq_head = 0;
    nvme->io_sq_tail = 0;

    nvme->io_cq = nvme->iob.virt + PAGE_SIZE * IDX_IO_CQ;
    nvme->io_cq_head = 0;
    nvme->io_cq_toggle = 1;

    // scratch page for admin ops
    void* scratch = nvme->iob.virt + PAGE_SIZE * IDX_SCRATCH;

    if (thrd_create_with_name(&nvme->irqthread, irq_thread, nvme, "nvme-irq-thread")) {
        zxlogf(ERROR, "nvme: cannot create irq thread\n");
        return ZX_ERR_INTERNAL;
    }
    nvme->flags |= FLAG_IRQ_THREAD_STARTED;

    if (thrd_create_with_name(&nvme->iothread, io_thread, nvme, "nvme-io-thread")) {
        zxlogf(ERROR, "nvme: cannot create io thread\n");
        return ZX_ERR_INTERNAL;
    }
    nvme->flags |= FLAG_IO_THREAD_STARTED;

    nvme_cmd_t cmd;

    // identify device
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
    cmd.nsid = 0;
    cmd.reserved = 0;
    cmd.mptr = 0;
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];
    cmd.dptr.prp[1] = 0;
    cmd.u.raw[0] = 1; // CNS 01

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: device identify op failed\n");
        return ZX_ERR_INTERNAL;
    }

    nvme_identify_t* ci = scratch;
    infostring("model:         ", ci->MN, sizeof(ci->MN));
    infostring("serial number: ", ci->SN, sizeof(ci->SN));
    infostring("firmware:      ", ci->FR, sizeof(ci->FR));

    if ((ci->SQES & 0xF) != NVME_CMD_SHIFT) {
        zxlogf(ERROR, "nvme: SQES minimum is not %ub\n", NVME_CMD_SIZE);
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((ci->CQES & 0xF) != NVME_CPL_SHIFT) {
        zxlogf(ERROR, "nvme: CQES minimum is not %ub\n", NVME_CPL_SIZE);
        return ZX_ERR_NOT_SUPPORTED;
    }
    zxlogf(INFO, "nvme: max outstanding commands: %u\n", ci->MAXCMD);

    uint32_t nscount = ci->NN;
    zxlogf(INFO, "nvme: max namespaces: %u\n", nscount);
    zxlogf(INFO, "nvme: scatter gather lists (SGL): %c %08x\n",
           (ci->SGLS & 3) ? 'Y' : 'N', ci->SGLS);

    // Maximum transfer is in units of 2^n * PAGESIZE, n == 0 means "infinite"
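    // (for example, MDTS == 5 with 4K pages caps transfers at 128KB)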
    nvme->max_xfer = 0xFFFFFFFF;
    if ((ci->MDTS != 0) && (ci->MDTS < (31 - PAGE_SHIFT))) {
        nvme->max_xfer = (1 << ci->MDTS) * PAGE_SIZE;
    }

    zxlogf(INFO, "nvme: max data transfer: %u bytes\n", nvme->max_xfer);
    zxlogf(INFO, "nvme: sanitize caps: %u\n", ci->SANICAP & 3);

    zxlogf(INFO, "nvme: abort command limit (ACL): %u\n", ci->ACL + 1);
    zxlogf(INFO, "nvme: asynch event req limit (AERL): %u\n", ci->AERL + 1);
    zxlogf(INFO, "nvme: firmware: slots: %u reset: %c slot1ro: %c\n", (ci->FRMW >> 1) & 3,
           (ci->FRMW & (1 << 4)) ? 'N' : 'Y', (ci->FRMW & 1) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: host buffer: min/preferred: %u/%u pages\n", ci->HMMIN, ci->HMPRE);
    zxlogf(INFO, "nvme: capacity: total/unalloc: %zu/%zu\n", ci->TNVMCAP_LO, ci->UNVMCAP_LO);

    if (ci->VWC & 1) {
        nvme->flags |= FLAG_HAS_VWC;
    }
    uint32_t awun = ci->AWUN + 1;
    uint32_t awupf = ci->AWUPF + 1;
    zxlogf(INFO, "nvme: volatile write cache (VWC): %s\n", nvme->flags & FLAG_HAS_VWC ? "Y" : "N");
    zxlogf(INFO, "nvme: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", awun, awupf);

#define FEATURE(a,b) if (ci->a & a##_##b) zxlogf(INFO, "nvme: feature: %s\n", #b)
    FEATURE(OACS, DOORBELL_BUFFER_CONFIG);
    FEATURE(OACS, VIRTUALIZATION_MANAGEMENT);
    FEATURE(OACS, NVME_MI_SEND_RECV);
    FEATURE(OACS, DIRECTIVE_SEND_RECV);
    FEATURE(OACS, DEVICE_SELF_TEST);
    FEATURE(OACS, NAMESPACE_MANAGEMENT);
    FEATURE(OACS, FIRMWARE_DOWNLOAD_COMMIT);
    FEATURE(OACS, FORMAT_NVM);
    FEATURE(OACS, SECURITY_SEND_RECV);
    FEATURE(ONCS, TIMESTAMP);
    FEATURE(ONCS, RESERVATIONS);
    FEATURE(ONCS, SAVE_SELECT_NONZERO);
    FEATURE(ONCS, WRITE_UNCORRECTABLE);
    FEATURE(ONCS, COMPARE);

    // set feature (number of queues) to 1 iosq and 1 iocq
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_SET_FEATURE);
    cmd.u.raw[0] = NVME_FEATURE_NUMBER_OF_QUEUES;
    cmd.u.raw[1] = 0;
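    // (dword 11 carries the requested queue counts, completion queues in the
    // upper 16 bits and submission queues in the lower, zero-based, so 0
    // asks for one of each)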

    nvme_cpl_t cpl;
    if (nvme_admin_txn(nvme, &cmd, &cpl) != ZX_OK) {
        zxlogf(ERROR, "nvme: set feature (number queues) op failed\n");
        return ZX_ERR_INTERNAL;
    }
    zxlogf(INFO, "cpl.cmd %08x\n", cpl.cmd);

    // create the IO completion queue
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOCQ);
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_CQ];
    cmd.u.raw[0] = ((CQMAX - 1) << 16) | 1; // queue size, queue id
    cmd.u.raw[1] = (0 << 16) | 2 | 1; // irq vector, irq enable, phys contig

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: completion queue creation op failed\n");
        return ZX_ERR_INTERNAL;
    }

    // create the IO submit queue
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOSQ);
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_SQ];
    cmd.u.raw[0] = ((SQMAX - 1) << 16) | 1; // queue size, queue id
    cmd.u.raw[1] = (1 << 16) | 0 | 1; // cqid, qprio, phys contig

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: submit queue creation op failed\n");
        return ZX_ERR_INTERNAL;
    }

    // identify namespace 1
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
    cmd.nsid = 1;
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: namespace identify op failed\n");
        return ZX_ERR_INTERNAL;
    }

    nvme_identify_ns_t* ni = scratch;

    uint32_t nawun = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUN + 1U) : awun;
    uint32_t nawupf = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUPF + 1U) : awupf;
    zxlogf(INFO, "nvme: ns: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", nawun, nawupf);
    zxlogf(INFO, "nvme: ns: NABSN/NABO/NABSPF/NOIOB: %u/%u/%u/%u\n",
           ni->NABSN, ni->NABO, ni->NABSPF, ni->NOIOB);

    // table of block formats
    for (unsigned i = 0; i < 16; i++) {
        if (ni->LBAF[i]) {
            zxlogf(INFO, "nvme: ns: LBA FMT %02d: RP=%u LBADS=2^%ub MS=%ub\n",
                    i, NVME_LBAFMT_RP(ni->LBAF[i]), NVME_LBAFMT_LBADS(ni->LBAF[i]),
                    NVME_LBAFMT_MS(ni->LBAF[i]));
        }
    }

    zxlogf(INFO, "nvme: ns: LBA FMT #%u active\n", ni->FLBAS & 0xF);
    zxlogf(INFO, "nvme: ns: data protection: caps/set: 0x%02x/%u\n",
           ni->DPC & 0x3F, ni->DPS & 3);

    uint32_t fmt = ni->LBAF[ni->FLBAS & 0xF];

    zxlogf(INFO, "nvme: ns: size/cap/util: %zu/%zu/%zu blks\n", ni->NSSZ, ni->NCAP, ni->NUSE);

    nvme->info.block_count = ni->NSSZ;
    nvme->info.block_size = 1 << NVME_LBAFMT_LBADS(fmt);
    nvme->info.max_transfer_size = BLOCK_MAX_TRANSFER_UNBOUNDED;

    if (NVME_LBAFMT_MS(fmt)) {
        zxlogf(ERROR, "nvme: cannot handle LBA format with metadata\n");
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((nvme->info.block_size < 512) || (nvme->info.block_size > 32768)) {
        zxlogf(ERROR, "nvme: cannot handle LBA size of %u\n", nvme->info.block_size);
        return ZX_ERR_NOT_SUPPORTED;
    }

    // NVME r/w commands operate in block units, maximum of 64K:
    size_t max_bytes_per_cmd = ((size_t) nvme->info.block_size) * ((size_t) 65536);

    if (nvme->max_xfer > max_bytes_per_cmd) {
        nvme->max_xfer = max_bytes_per_cmd;
    }

    // The device may allow transfers larger than we are prepared
    // to handle.  Clip to our limit.
    if (nvme->max_xfer > MAX_XFER) {
        nvme->max_xfer = MAX_XFER;
    }

    // convert to block units
    nvme->max_xfer /= nvme->info.block_size;
    zxlogf(INFO, "nvme: max transfer per r/w op: %u blocks (%u bytes)\n",
           nvme->max_xfer, nvme->max_xfer * nvme->info.block_size);

    device_make_visible(nvme->zxdev);
    return ZX_OK;
}

block_impl_protocol_ops_t block_ops = {
    .query = nvme_query,
    .queue = nvme_queue,
};

static zx_status_t nvme_bind(void* ctx, zx_device_t* dev) {
    nvme_device_t* nvme;
    if ((nvme = calloc(1, sizeof(nvme_device_t))) == NULL) {
        return ZX_ERR_NO_MEMORY;
    }
    list_initialize(&nvme->pending_txns);
    list_initialize(&nvme->active_txns);
    mtx_init(&nvme->lock, mtx_plain);
    mtx_init(&nvme->admin_lock, mtx_plain);

    if (device_get_protocol(dev, ZX_PROTOCOL_PCI, &nvme->pci)) {
        goto fail;
    }

    if (pci_map_bar_buffer(&nvme->pci, 0u, ZX_CACHE_POLICY_UNCACHED_DEVICE, &nvme->mmio)) {
        zxlogf(ERROR, "nvme: cannot map registers\n");
        goto fail;
    }

    uint32_t modes[3] = {
        ZX_PCIE_IRQ_MODE_MSI_X, ZX_PCIE_IRQ_MODE_MSI, ZX_PCIE_IRQ_MODE_LEGACY,
    };
    uint32_t nirq = 0;
    for (unsigned n = 0; n < countof(modes); n++) {
        if ((pci_query_irq_mode(&nvme->pci, modes[n], &nirq) == ZX_OK) &&
            (pci_set_irq_mode(&nvme->pci, modes[n], 1) == ZX_OK)) {
            zxlogf(INFO, "nvme: irq mode %u, irq count %u (#%u)\n", modes[n], nirq, n);
            goto irq_configured;
        }
    }
    zxlogf(ERROR, "nvme: could not configure irqs\n");
    goto fail;

irq_configured:
    if (pci_map_interrupt(&nvme->pci, 0, &nvme->irqh) != ZX_OK) {
        zxlogf(ERROR, "nvme: could not map irq\n");
        goto fail;
    }
    if (pci_enable_bus_master(&nvme->pci, true)) {
        zxlogf(ERROR, "nvme: cannot enable bus mastering\n");
        goto fail;
    }
    if (pci_get_bti(&nvme->pci, 0, &nvme->bti) != ZX_OK) {
        zxlogf(ERROR, "nvme: cannot obtain bti handle\n");
        goto fail;
    }

    device_add_args_t args = {
        .version = DEVICE_ADD_ARGS_VERSION,
        .name = "nvme",
        .ctx = nvme,
        .ops = &device_ops,
        .flags = DEVICE_ADD_INVISIBLE,
        .proto_id = ZX_PROTOCOL_BLOCK_IMPL,
        .proto_ops = &block_ops,
    };

    if (device_add(dev, &args, &nvme->zxdev)) {
        goto fail;
    }

    if (nvme_init(nvme) != ZX_OK) {
        zxlogf(ERROR, "nvme: init failed\n");
        device_remove(nvme->zxdev);
        return ZX_ERR_INTERNAL;
    }

    return ZX_OK;

fail:
    nvme_release(nvme);
    return ZX_ERR_NOT_SUPPORTED;
}

static zx_driver_ops_t driver_ops = {
    .version = DRIVER_OPS_VERSION,
    .bind = nvme_bind,
};

ZIRCON_DRIVER_BEGIN(nvme, driver_ops, "zircon", "0.1", 4)
    BI_ABORT_IF(NE, BIND_PROTOCOL, ZX_PROTOCOL_PCI),
    BI_ABORT_IF(NE, BIND_PCI_CLASS, 1), // Mass Storage
    BI_ABORT_IF(NE, BIND_PCI_SUBCLASS, 8), // NVM
    BI_MATCH_IF(EQ, BIND_PCI_INTERFACE, 2), // NVMHCI
ZIRCON_DRIVER_END(nvme)