/*
 * Copyright (c) 2006-2023, RT-Thread Development Team
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Change Logs:
 * Date           Author       Notes
 * 2023-02-25     GuEe-GUI     the first version
 */

#include <rthw.h>
#include <rtthread.h>
#include <rtdevice.h>

#define DBG_TAG "rtdm.nvme"
#define DBG_LVL DBG_INFO
#include <rtdbg.h>

static struct rt_dm_ida nvme_controller_ida = RT_DM_IDA_INIT(CUSTOM);
static struct rt_dm_ida nvme_ida = RT_DM_IDA_INIT(NVME);

static RT_DEFINE_SPINLOCK(nvme_lock);
static rt_list_t nvme_nodes = RT_LIST_OBJECT_INIT(nvme_nodes);

rt_inline rt_uint32_t nvme_readl(struct rt_nvme_controller *nvme, int offset)
{
    return HWREG32(nvme->regs + offset);
}

rt_inline void nvme_writel(struct rt_nvme_controller *nvme, int offset, rt_uint32_t value)
{
    HWREG32(nvme->regs + offset) = value;
}

rt_inline rt_uint64_t nvme_readq(struct rt_nvme_controller *nvme, int offset)
{
    rt_uint32_t lo32, hi32;

    lo32 = HWREG32(nvme->regs + offset);
    hi32 = HWREG32(nvme->regs + offset + 4);

    return ((rt_uint64_t)hi32 << 32) + lo32;
}

rt_inline void nvme_writeq(struct rt_nvme_controller *nvme, int offset, rt_uint64_t value)
{
    nvme_writel(nvme, offset, (rt_uint32_t)(value & 0xffffffff));
    nvme_writel(nvme, offset + 4, (rt_uint32_t)(value >> 32));
}

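/*
 * Poll the controller status register (CSTS) until the masked bits match the
 * expected value. The deadline is derived from CAP.TO, which is expressed in
 * units of 500 ms.
 */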
static rt_err_t nvme_poll_csts(struct rt_nvme_controller *nvme,
        rt_uint32_t mask, rt_uint32_t value)
{
    rt_tick_t timeout;

    timeout = rt_tick_from_millisecond(RT_NVME_CAP_TIMEOUT(nvme->cap) * 500);
    timeout += rt_tick_get();

    do {
        if ((nvme_readl(nvme, RT_NVME_REG_CSTS) & mask) == value)
        {
            return RT_EOK;
        }

        rt_hw_cpu_relax();
    } while (rt_tick_get() < timeout);

    return -RT_ETIMEOUT;
}

static rt_err_t nvme_enable_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config |= RT_NVME_CC_ENABLE;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, RT_NVME_CSTS_RDY);
}

static rt_err_t nvme_disable_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config &= ~RT_NVME_CC_ENABLE;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_RDY, 0);
}

static rt_err_t nvme_shutdown_ctrl(struct rt_nvme_controller *nvme)
{
    nvme->ctrl_config &= ~RT_NVME_CC_SHN_MASK;
    nvme->ctrl_config |= RT_NVME_CC_SHN_NORMAL;
    nvme_writel(nvme, RT_NVME_REG_CC, nvme->ctrl_config);

    return nvme_poll_csts(nvme, RT_NVME_CSTS_SHST_MASK, RT_NVME_CSTS_SHST_CMPLT);
}

rt_inline rt_le16_t nvme_next_cmdid(struct rt_nvme_controller *nvme)
{
    return rt_cpu_to_le16((rt_uint16_t)rt_atomic_add(&nvme->cmdid, 1));
}

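/*
 * Copy the command into the submission queue, ring the SQ tail doorbell, and
 * block on the queue's completion object until the completion interrupt
 * (nvme_queue_isr) reports the result for this command.
 */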
static rt_err_t nvme_submit_cmd(struct rt_nvme_queue *queue,
        struct rt_nvme_command *cmd)
{
    rt_ubase_t level;
    rt_err_t err = RT_EOK;
    rt_uint16_t tail, head;
    struct rt_nvme_controller *nvme = queue->nvme;

_retry:
    level = rt_spin_lock_irqsave(&queue->lock);

    tail = queue->sq_tail;
    head = queue->cq_head;

    if (tail + 1 == head)
    {
        /* IO queue is full, waiting for the last IO command to complete. */
        rt_spin_unlock_irqrestore(&queue->lock, level);

        rt_thread_yield();

        goto _retry;
    }

    cmd->common.cmdid = nvme_next_cmdid(nvme);
    rt_memcpy(&queue->sq_cmds[tail], cmd, sizeof(*cmd));

    if (nvme->ops->submit_cmd)
    {
        if ((err = nvme->ops->submit_cmd(queue, cmd)))
        {
            rt_spin_unlock_irqrestore(&queue->lock, level);

            return err;
        }
    }

    if (++tail == queue->depth)
    {
        tail = 0;
    }
    HWREG32(queue->doorbell) = tail;
    queue->sq_tail = tail;

    queue->cmd = cmd;
    queue->err = RT_EOK;

    rt_spin_unlock_irqrestore(&queue->lock, level);

    err = rt_completion_wait(&queue->done,
            rt_tick_from_millisecond(queue->qid != 0 ? RT_WAITING_FOREVER : 60));

    return err ? : queue->err;
}

static rt_err_t nvme_set_features_simple(struct rt_nvme_controller *nvme,
        rt_uint32_t fid, rt_uint32_t dword11)
{
    struct rt_nvme_command cmd;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.features.opcode = RT_NVME_ADMIN_OPCODE_SET_FEATURES;
    cmd.features.fid = rt_cpu_to_le32(fid);
    cmd.features.dword11 = rt_cpu_to_le32(dword11);

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

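/*
 * Pick an I/O queue for the calling CPU: each CPU advances its own counter in
 * steps of RT_CPUS_NR, so commands from different CPUs spread round-robin
 * across the available I/O queues.
 */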
static rt_err_t nvme_submit_io_cmd(struct rt_nvme_controller *nvme,
        struct rt_nvme_command *cmd)
{
    rt_uint16_t qid;

    qid = rt_atomic_add(&nvme->ioqid[rt_hw_cpu_id()], RT_CPUS_NR);
    qid %= nvme->io_queue_max;

    return nvme_submit_cmd(&nvme->io_queues[qid], cmd);
}

/*
 * PRP Mode:
 *
 * |63                                   n+1|n                0|
 * +----------------------------------------+----------+---+---+
 * |            Page Base Address           |  Offset  | 0 | 0 |
 * +----------------------------------------+----------+---+---+
 *                                                             |
 *                                                             v
 *                                            Host Physical Pages
 *                                       +----------------------------+
 * +--------------+----------+           |           Page k           |
 * |  PRP Entry1  |  Offset  +---------->+----------------------------+
 * +--------------+----------+           |         Page k + 1         |
 *                                       +----------------------------+
 *                                                     ...
 *                                       +----------------------------+
 * +--------------+----------+           |         Page k + m         |
 * |  PRP Entry2  |    0     +---------->+----------------------------+
 * +--------------+----------+           |       Page k + m + 1       |
 *                                       +----------------------------+
 * PRP List (In PRP Entry2):
 *
 * |63                                   n+1|n                0|
 * +----------------------------------------+------------------+
 * |           Page Base Address k          |        0h        |
 * +----------------------------------------+------------------+
 * |        Page Base Address k + 1         |        0h        |
 * +----------------------------------------+------------------+
 * |                            ...                            |
 * +----------------------------------------+------------------+
 * |        Page Base Address k + m         |        0h        |
 * +----------------------------------------+------------------+
 * |       Page Base Address k + m + 1      |        0h        |
 * +----------------------------------------+------------------+
 *
 * SGL Mode:
 *                                           +----- Non-transport
 * LBA                                      /
 * +---------------+---------------+-------/-------+---------------+
 * |      3KB      |      4KB      |      2KB      |      4KB      |
 * +-------+-------+-------+-------+---------------+--------+------+
 *         |               +-------------------------+      |
 *         |                                         |      |
 *         |                    +--------------------|------+
 *         |                    |                    |
 * +-------v-------+    +-------v-------+    +-------v-------+
 * |  A MEM BLOCK  |    |  B MEM BLOCK  |    |  C MEM BLOCK  |
 * +-------^-------+    +-------^-------+    +-------^-------+
 *         |                    |                    |
 *         +----------------+   |                    |
 *                          |   |                    |
 * Segment(0)               |   |                    |
 * +----------+----------+  |   |                    |
 * | Address: A          +--+   |                    |
 * +----------+----------+      |                    |
 * | Type: 0h | Len: 3KB |      |                    |
 * +----------+----------+      |                    |
 * | Address: Segment(1) +--+   |                    |
 * +----------+----------+  |   |                    |
 * | Type: 2h | Len: 48  |  |   |                    |
 * +----------+----------+  |   |                    |
 *                          |   |                    |
 * +------------------------+   |                    |
 * |                            |                    |
 * v                            |                    |
 * Segment(1)                   |                    |
 * +----------+----------+      |                    |
 * | Address: B          +------+                    |
 * +----------+----------+                           |
 * | Type: 0h | Len: 4KB |                           |
 * +----------+----------+                           |
 * | Address: <NULL>     |                           |
 * +----------+----------+                           |
 * | Type: 1h | Len: 2KB |                           |
 * +----------+----------+                           |
 * | Address: Segment(2) +--+                        |
 * +----------+----------+  |                        |
 * | Type: 3h | Len: 16  |  |                        |
 * +----------+----------+  |                        |
 *                          |                        |
 * +------------------------+                        |
 * |                                                 |
 * v                                                 |
 * Segment(2)                                        |
 * +----------+----------+                           |
 * | Address: C          +---------------------------+
 * +----------+----------+
 * | Type: 0h | Len: 4KB |
 * +----------+----------+
 */

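/*
 * Worked example (PRP, 4 KiB pages): a 6 KiB transfer whose buffer starts at
 * physical address 0x80001800 uses
 *   PRP1 = 0x80001800 (2 KiB left in the first page)
 *   PRP2 = 0x80002000 (the remaining 4 KiB fits in one more page)
 * Only transfers that need more than two pages replace PRP2 with the physical
 * address of a PRP list as drawn above. The addresses here are illustrative
 * only.
 */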
static rt_ssize_t nvme_blk_rw(struct rt_nvme_device *ndev, rt_off_t slba,
        rt_ubase_t buffer_dma, rt_size_t lbas, rt_uint8_t opcode)
{
    rt_err_t err;
    rt_uint16_t max_lbas;
    rt_uint32_t lba_shift;
    rt_size_t tlbas;
    rt_ssize_t data_length;
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = ndev->ctrl;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.rw.opcode = opcode;
    cmd.rw.flags = nvme->sgl_mode << RT_NVME_CMD_FLAGS_PSDT_SHIFT;
    cmd.rw.nsid = rt_cpu_to_le32(ndev->nsid);

    tlbas = lbas;
    lba_shift = ndev->lba_shift;
    max_lbas = 1 << (nvme->max_transfer_shift - lba_shift);

    if (nvme->sgl_mode)
    {
        while ((rt_ssize_t)lbas > 0)
        {
            if (lbas < max_lbas)
            {
                max_lbas = (rt_uint16_t)lbas;
            }

            data_length = max_lbas << lba_shift;

            cmd.rw.sgl.adddress = rt_cpu_to_le64(buffer_dma);
            cmd.rw.sgl.length = rt_cpu_to_le32(data_length);
            cmd.rw.sgl.sgl_identify = SGL_DESC_TYPE_DATA_BLOCK;
            cmd.rw.slba = rt_cpu_to_le64(slba);
            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);

            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
            {
                tlbas -= lbas;
                break;
            }

            lbas -= max_lbas;
            slba += max_lbas;
            buffer_dma += data_length;
        }
    }
    else
    {
        void *prp_list = RT_NULL;
        rt_size_t prp_list_size = 0, page_size;

        page_size = nvme->page_size;

        while ((rt_ssize_t)lbas > 0)
        {
            rt_uint64_t prp2_addr, dma_addr;
            rt_ssize_t remain_length, page_offset;

            if (lbas < max_lbas)
            {
                max_lbas = (rt_uint16_t)lbas;
            }

            /*
             * PRP transfer:
             *  1. data_length <= 4KB:
             *      prp1 = buffer_dma
             *      prp2 = 0
             *
             *  2. 4KB < data_length <= 8KB:
             *      prp1 = buffer_dma
             *      prp2 = buffer_dma
             *
             *  3. 8KB < data_length:
             *      prp1 = buffer_dma(0, 4k)
             *      prp2 = buffer_dma(4k, ~)
             */
            dma_addr = buffer_dma;
            page_offset = buffer_dma & (page_size - 1);
            data_length = max_lbas << lba_shift;
            remain_length = data_length - (page_size - page_offset);

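            /*
             * Build PRP2 for this chunk: zero when everything fits in the
             * first page, the physical address of the second page when one
             * more page is enough, otherwise the physical address of a PRP
             * list that holds one 64-bit page address per remaining page
             * (the last slot of each list page chains to the next list page).
             */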
            do {
                rt_size_t prps_per_page, prps, pages;
                rt_uint64_t *prp_list_ptr, prp_list_dma;

                if (remain_length <= 0)
                {
                    prp2_addr = 0;
                    break;
                }

                if (remain_length)
                {
                    dma_addr += (page_size - page_offset);
                }

                if (remain_length <= page_size)
                {
                    prp2_addr = dma_addr;
                    break;
                }

                prps_per_page = page_size / sizeof(rt_uint64_t);
                prps = RT_DIV_ROUND_UP(remain_length, page_size);
                pages = RT_DIV_ROUND_UP(prps - 1, prps_per_page - 1);

                if (prps > prp_list_size)
                {
                    if (prp_list)
                    {
                        rt_free_align(prp_list);
                    }

                    prp_list = rt_malloc_align(pages * page_size, page_size);

                    if (!prp_list)
                    {
                        LOG_D("No memory to create a PRP List");
                        /* Ask user to try again */
                        return tlbas - lbas;
                    }

                    prp_list_size = pages * (prps_per_page - 1) + 1;
                }
                prp_list_ptr = prp_list;
                prp_list_dma = (rt_uint64_t)rt_kmem_v2p(prp_list_ptr);

                prp2_addr = prp_list_dma;

                for (int i = 0; prps; --prps, ++i)
                {
                    /* Last slot of this list page: chain to the next list page if entries remain */
                    if ((i == (prps_per_page - 1)) && prps > 1)
                    {
                        prp_list_dma += page_size;
                        *prp_list_ptr++ = rt_cpu_to_le64(prp_list_dma);

                        /* Start to fill the next list page */
                        i = 0;
                    }

                    *prp_list_ptr++ = rt_cpu_to_le64(dma_addr);
                    dma_addr += page_size;
                }

                rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, prp_list, pages * page_size);
            } while (0);

            cmd.rw.prp1 = rt_cpu_to_le64(buffer_dma);
            cmd.rw.prp2 = rt_cpu_to_le64(prp2_addr);
            cmd.rw.slba = rt_cpu_to_le64(slba);
            cmd.rw.length = rt_cpu_to_le16(max_lbas - 1);

            if ((err = nvme_submit_io_cmd(nvme, &cmd)))
            {
                tlbas -= lbas;
                break;
            }

            lbas -= max_lbas;
            slba += max_lbas;
            buffer_dma += data_length;
        }

        if (prp_list)
        {
            rt_free_align(prp_list);
        }
    }

    return tlbas;
}

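/*
 * Block read path: if the caller's buffer does not satisfy the driver's DMA
 * alignment rule (4-byte aligned in SGL mode, page aligned in PRP mode), the
 * data is staged through a page-aligned bounce buffer and copied out after
 * the transfer completes.
 */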
static rt_ssize_t nvme_blk_read(struct rt_blk_disk *disk, rt_off_t sector,
        void *buffer, rt_size_t sector_count)
{
    rt_ssize_t res;
    rt_uint32_t page_bits;
    rt_size_t buffer_size;
    rt_ubase_t buffer_dma;
    void *temp_buffer = RT_NULL;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    buffer_size = (1 << ndev->lba_shift) * sector_count;
    buffer_dma = (rt_ubase_t)rt_kmem_v2p(buffer);

    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
    {
        LOG_D("Direct %s buffer must be 4-byte (SGL) or page (PRP) aligned, using a bounce buffer", "read");

        page_bits = rt_page_bits(buffer_size);
        temp_buffer = rt_pages_alloc(page_bits);

        if (!temp_buffer)
        {
            return -RT_ENOMEM;
        }

        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);
    }

    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_READ);

    if (res > 0)
    {
        if (res != sector_count)
        {
            /*
             * Only recompute the size for a short transfer: the equality
             * check is cheaper than doing the multiplication every time.
             */
            buffer_size = res * (1 << ndev->lba_shift);
        }

        if (temp_buffer)
        {
            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, temp_buffer, buffer_size);
            rt_memcpy(buffer, temp_buffer, buffer_size);
        }
        else
        {
            rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, buffer, buffer_size);
        }
    }

    if (temp_buffer)
    {
        rt_pages_free(temp_buffer, page_bits);
    }

    return res;
}

static rt_ssize_t nvme_blk_write(struct rt_blk_disk *disk, rt_off_t sector,
        const void *buffer, rt_size_t sector_count)
{
    rt_ssize_t res;
    rt_uint32_t page_bits;
    rt_size_t buffer_size;
    rt_ubase_t buffer_dma;
    void *temp_buffer = RT_NULL;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    buffer_size = (1 << ndev->lba_shift) * sector_count;
    buffer_dma = (rt_ubase_t)rt_kmem_v2p((void *)buffer);

    if ((nvme->sgl_mode && (buffer_dma & RT_GENMASK(1, 0))) ||
        (!nvme->sgl_mode && (buffer_dma & ARCH_PAGE_MASK)))
    {
        LOG_D("Direct %s buffer must be 4-byte (SGL) or page (PRP) aligned, using a bounce buffer", "write");

        page_bits = rt_page_bits(buffer_size);
        temp_buffer = rt_pages_alloc(page_bits);

        if (!temp_buffer)
        {
            return -RT_ENOMEM;
        }

        buffer_dma = (rt_ubase_t)rt_kmem_v2p(temp_buffer);

        rt_memcpy(temp_buffer, buffer, buffer_size);
        buffer = temp_buffer;
    }

    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, (void *)buffer, buffer_size);

    res = nvme_blk_rw(ndev, sector, buffer_dma, sector_count, RT_NVME_CMD_WRITE);

    if (temp_buffer)
    {
        rt_pages_free(temp_buffer, page_bits);
    }

    return res;
}

static rt_err_t nvme_blk_getgeome(struct rt_blk_disk *disk,
        struct rt_device_blk_geometry *geometry)
{
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);

    geometry->bytes_per_sector = 1 << ndev->lba_shift;
    geometry->block_size = 1 << ndev->lba_shift;
    geometry->sector_count = rt_le64_to_cpu(ndev->id.nsze);

    return RT_EOK;
}

static rt_err_t nvme_blk_sync(struct rt_blk_disk *disk)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.common.opcode = RT_NVME_CMD_FLUSH;
    cmd.common.nsid = rt_cpu_to_le32(ndev->nsid);

    return nvme_submit_io_cmd(ndev->ctrl, &cmd);
}

static rt_err_t nvme_blk_erase(struct rt_blk_disk *disk)
{
    rt_err_t err = RT_EOK;
    rt_ssize_t slba, lbas, max_lbas;
    struct rt_nvme_command cmd;
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    if (!nvme->write_zeroes)
    {
        return -RT_ENOSYS;
    }

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.write_zeroes.opcode = RT_NVME_CMD_WRITE_ZEROES;
    cmd.write_zeroes.nsid = rt_cpu_to_le32(ndev->nsid);

    slba = 0;
    lbas = rt_le64_to_cpu(ndev->id.nsze);
    max_lbas = 1 << (nvme->max_transfer_shift - ndev->lba_shift);

    while ((rt_ssize_t)lbas > 0)
    {
        if (lbas < max_lbas)
        {
            max_lbas = (rt_uint16_t)lbas;
        }

        cmd.write_zeroes.slba = rt_cpu_to_le64(slba);
        cmd.write_zeroes.length = rt_cpu_to_le16(max_lbas - 1);

        if ((err = nvme_submit_io_cmd(nvme, &cmd)))
        {
            break;
        }

        lbas -= max_lbas;
        slba += max_lbas;
    }

    return err;
}

static rt_err_t nvme_blk_autorefresh(struct rt_blk_disk *disk, rt_bool_t is_auto)
{
    struct rt_nvme_device *ndev = rt_disk_to_nvme_device(disk);
    struct rt_nvme_controller *nvme = ndev->ctrl;

    if (nvme->volatile_write_cache & RT_NVME_CTRL_VWC_PRESENT)
    {
        return nvme_set_features_simple(nvme, RT_NVME_FEAT_VOLATILE_WC, !!is_auto);
    }
    else if (!is_auto)
    {
        return RT_EOK;
    }

    return -RT_ENOSYS;
}

static const struct rt_blk_disk_ops nvme_blk_ops =
{
    .read = nvme_blk_read,
    .write = nvme_blk_write,
    .getgeome = nvme_blk_getgeome,
    .sync = nvme_blk_sync,
    .erase = nvme_blk_erase,
    .autorefresh = nvme_blk_autorefresh,
};

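/*
 * Completion-queue interrupt handler: a completion entry is new only when its
 * phase bit matches the queue's current phase; the phase flips each time the
 * head index wraps around the queue.
 */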
static void nvme_queue_isr(int irqno, void *param)
{
    rt_ubase_t level;
    rt_uint16_t head, phase, status;
    struct rt_nvme_queue *queue = param;
    struct rt_nvme_controller *nvme = queue->nvme;

    level = rt_spin_lock_irqsave(&queue->lock);

    head = queue->cq_head;
    phase = queue->cq_phase;
    status = HWREG16(&queue->cq_entry[head].status);
    status = rt_le16_to_cpu(status);

    if ((status & 0x01) == phase)
    {
        if ((status >> 1))
        {
            queue->err = -RT_EIO;
            goto _end_cmd;
        }

        if (nvme->ops->complete_cmd)
        {
            nvme->ops->complete_cmd(queue, queue->cmd);
        }

    _end_cmd:
        if (++head == queue->depth)
        {
            head = 0;
            phase = !phase;
        }

        HWREG32(queue->doorbell + nvme->doorbell_stride) = head;
        queue->cq_head = head;
        queue->cq_phase = phase;

        rt_completion_done(&queue->done);
    }

    rt_spin_unlock_irqrestore(&queue->lock, level);
}

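/*
 * Issue an Identify admin command. CNS 0 returns the namespace data structure
 * for `nsid`, CNS 1 returns the controller data structure; both are 4 KiB, so
 * a second PRP entry is only needed when the buffer crosses a page boundary.
 */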
static rt_err_t nvme_identify(struct rt_nvme_controller *nvme,
        rt_uint32_t nsid, rt_uint32_t cns, void *data)
{
    rt_err_t err;
    rt_uint32_t page_size = nvme->page_size;
    rt_ubase_t data_phy = (rt_ubase_t)rt_kmem_v2p(data);
    int offset = data_phy & (page_size - 1);
    struct rt_nvme_command cmd;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.identify.opcode = RT_NVME_ADMIN_OPCODE_IDENTIFY;
    cmd.identify.nsid = rt_cpu_to_le32(nsid);
    cmd.identify.prp1 = rt_cpu_to_le64(data_phy);

    if (sizeof(struct rt_nvme_id_ctrl) <= page_size - offset)
    {
        cmd.identify.prp2 = 0;
    }
    else
    {
        data_phy += (page_size - offset);
        cmd.identify.prp2 = rt_cpu_to_le64(data_phy);
    }
    cmd.identify.cns = rt_cpu_to_le32(cns);

    rt_hw_cpu_dcache_ops(RT_HW_CACHE_FLUSH, data, sizeof(struct rt_nvme_id_ctrl));

    if (!(err = nvme_submit_cmd(&nvme->admin_queue, &cmd)))
    {
        rt_hw_cpu_dcache_ops(RT_HW_CACHE_INVALIDATE, data, sizeof(struct rt_nvme_id_ctrl));
    }

    return err;
}

static rt_err_t nvme_attach_queue(struct rt_nvme_queue *queue, rt_uint8_t opcode)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = queue->nvme;
    rt_uint16_t flags = RT_NVME_QUEUE_PHYS_CONTIG;

    rt_memset(&cmd, 0, sizeof(cmd));

    if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_CQ)
    {
        cmd.create_cq.opcode = opcode;
        cmd.create_cq.prp1 = rt_cpu_to_le64(queue->cq_entry_phy);
        cmd.create_cq.cqid = rt_cpu_to_le16(queue->qid);
        cmd.create_cq.qsize = rt_cpu_to_le16(queue->depth - 1);
        cmd.create_cq.cq_flags = rt_cpu_to_le16(flags | RT_NVME_CQ_IRQ_ENABLED);
        cmd.create_cq.irq_vector = rt_cpu_to_le16(nvme->irqs_nr > 1 ? queue->qid : 0);
    }
    else if (opcode == RT_NVME_ADMIN_OPCODE_CREATE_SQ)
    {
        cmd.create_sq.opcode = opcode;
        cmd.create_sq.prp1 = rt_cpu_to_le64(queue->sq_cmds_phy);
        cmd.create_sq.sqid = rt_cpu_to_le16(queue->qid);
        cmd.create_sq.qsize = rt_cpu_to_le16(queue->depth - 1);
        cmd.create_sq.sq_flags = rt_cpu_to_le16(flags | RT_NVME_SQ_PRIO_MEDIUM);
        cmd.create_sq.cqid = rt_cpu_to_le16(queue->qid);
    }
    else
    {
        LOG_E("Unexpected queue create opcode = %x", opcode);
        RT_ASSERT(0);
    }

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

rt_inline rt_err_t nvme_attach_queue_sq(struct rt_nvme_queue *queue)
{
    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_SQ);
}

rt_inline rt_err_t nvme_attach_queue_cq(struct rt_nvme_queue *queue)
{
    return nvme_attach_queue(queue, RT_NVME_ADMIN_OPCODE_CREATE_CQ);
}

static rt_err_t nvme_detach_queue(struct rt_nvme_queue *queue,
        rt_uint8_t opcode)
{
    struct rt_nvme_command cmd;
    struct rt_nvme_controller *nvme = queue->nvme;

    rt_memset(&cmd, 0, sizeof(cmd));
    cmd.delete_queue.opcode = opcode;
    cmd.delete_queue.qid = rt_cpu_to_le16(queue->qid);

    return nvme_submit_cmd(&nvme->admin_queue, &cmd);
}

rt_inline rt_ubase_t nvme_queue_dma_flags(void)
{
    return RT_DMA_F_NOCACHE | RT_DMA_F_LINEAR;
}

static void nvme_free_queue(struct rt_nvme_queue *queue)
{
    rt_ubase_t dma_flags;
    struct rt_nvme_controller *nvme = queue->nvme;

    if (nvme->ops->cleanup_queue)
    {
        rt_err_t err;

        if ((err = nvme->ops->cleanup_queue(queue)))
        {
            LOG_W("Cleanup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));
        }
    }

    dma_flags = nvme_queue_dma_flags();

    if (queue->sq_cmds)
    {
        rt_dma_free(nvme->dev, sizeof(*queue->sq_cmds) * queue->depth,
                queue->sq_cmds, queue->sq_cmds_phy, dma_flags);
    }

    if (queue->cq_entry)
    {
        rt_dma_free(nvme->dev, sizeof(*queue->cq_entry) * queue->depth,
                queue->cq_entry, queue->cq_entry_phy, dma_flags);
    }
}

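/*
 * Each queue pair owns two doorbells in the doorbell table: the submission
 * tail doorbell at index (2 * qid) * stride and the completion head doorbell
 * one stride above it, where the stride is 1 << CAP.DSTRD 32-bit words.
 */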
static struct rt_nvme_queue *nvme_alloc_queue(struct rt_nvme_controller *nvme,
        int qid, int depth)
{
    rt_err_t err;
    rt_ubase_t dma_flags;
    struct rt_nvme_queue *queue = &nvme->queue[qid];

    rt_memset(queue, 0, sizeof(*queue));

    queue->nvme = nvme;
    queue->doorbell = &nvme->doorbell_tbl[qid * 2 * nvme->doorbell_stride];
    queue->qid = qid;
    queue->depth = depth;
    queue->cq_head = 0;
    queue->cq_phase = 1;
    rt_completion_init(&queue->done);
    rt_spin_lock_init(&queue->lock);

    dma_flags = nvme_queue_dma_flags();

    /* struct rt_nvme_command */
    queue->sq_cmds = rt_dma_alloc(nvme->dev,
            sizeof(*queue->sq_cmds) * depth, &queue->sq_cmds_phy, dma_flags);

    if (!queue->sq_cmds)
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    /* struct rt_nvme_completion */
    queue->cq_entry = rt_dma_alloc(nvme->dev,
            sizeof(*queue->cq_entry) * depth, &queue->cq_entry_phy, dma_flags);

    if (!queue->cq_entry)
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    rt_memset(queue->sq_cmds, 0, sizeof(struct rt_nvme_command) * depth);
    rt_memset(queue->cq_entry, 0, sizeof(struct rt_nvme_completion) * depth);

    if (nvme->ops->setup_queue)
    {
        if ((err = nvme->ops->setup_queue(queue)))
        {
            LOG_E("Setup[%s] queue error = %s", nvme->ops->name, rt_strerror(err));

            goto _fail;
        }
    }

    return queue;

_fail:
    nvme_free_queue(queue);

    return rt_err_ptr(err);
}

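/*
 * Bring up the admin queue: disable the controller, program the admin queue
 * size (AQA) and the submission/completion base addresses (ASQ/ACQ), select
 * the memory page size in CC, then re-enable the controller and hook the
 * admin completion interrupt.
 */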
static rt_err_t nvme_configure_admin_queue(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    int irq;
    char name[RT_NAME_MAX];
    rt_uint32_t aqa;
    rt_uint32_t page_shift = ARCH_PAGE_SHIFT;
    rt_uint32_t page_min = RT_NVME_CAP_MPSMIN(nvme->cap) + 12;
    rt_uint32_t page_max = RT_NVME_CAP_MPSMAX(nvme->cap) + 12;
    struct rt_nvme_queue *admin_queue;

    if (page_shift < page_min)
    {
        LOG_E("Device %s page size (%u) %s than host (%u)",
                "minimum", 1 << page_min, "larger", 1 << page_shift);
        return -RT_EINVAL;
    }

    if (page_shift > page_max)
    {
        LOG_W("Device %s page size (%u) %s than host (%u)",
                "maximum", 1 << page_max, "smaller", 1 << page_shift);
        page_shift = page_max;
    }

    if ((err = nvme_disable_ctrl(nvme)))
    {
        return err;
    }

    admin_queue = nvme_alloc_queue(nvme, 0, RT_NVME_AQ_DEPTH);

    if (rt_is_err(admin_queue))
    {
        return rt_ptr_err(admin_queue);
    }

    aqa = admin_queue->depth - 1;
    aqa |= aqa << 16;

    nvme->page_shift = page_shift;
    nvme->page_size = 1U << page_shift;

    nvme->ctrl_config = RT_NVME_CC_CSS_NVM;
    nvme->ctrl_config |= (page_shift - 12) << RT_NVME_CC_MPS_SHIFT;
    nvme->ctrl_config |= RT_NVME_CC_ARB_RR | RT_NVME_CC_SHN_NONE;
    nvme->ctrl_config |= RT_NVME_CC_IOSQES | RT_NVME_CC_IOCQES;

    nvme_writel(nvme, RT_NVME_REG_AQA, aqa);
    nvme_writeq(nvme, RT_NVME_REG_ASQ, admin_queue->sq_cmds_phy);
    nvme_writeq(nvme, RT_NVME_REG_ACQ, admin_queue->cq_entry_phy);

    if ((err = nvme_enable_ctrl(nvme)))
    {
        nvme_free_queue(admin_queue);

        return err;
    }

    irq = nvme->irqs[0];

    rt_snprintf(name, RT_NAME_MAX, "%s-admin-queue", nvme->name);

    rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->admin_queue, name);
    rt_hw_interrupt_umask(irq);

    return RT_EOK;
}

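/*
 * Request one I/O queue pair per extra interrupt (falling back to a single
 * pair) via the Number of Queues feature, whose dword11 carries zero-based
 * submission/completion queue counts, then create each pair and pin its
 * interrupt to a CPU where possible.
 */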
static rt_err_t nvme_setup_io_queues(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    rt_uint32_t value;
    int irq, cpuid = 0;
    char name[RT_NAME_MAX];
    rt_bool_t affinity_fixup = RT_FALSE;
    RT_IRQ_AFFINITY_DECLARE(affinity) = { 0 };
    struct rt_nvme_queue *queue;

    nvme->io_queue_max = nvme->irqs_nr > 1 ? nvme->irqs_nr - 1 : 1;
    value = (nvme->io_queue_max - 1) | ((nvme->io_queue_max - 1) << 16);

    if ((err = nvme_set_features_simple(nvme, RT_NVME_FEAT_NUM_QUEUES, value)))
    {
        return err;
    }

    for (int i = 0, q_idx = 1; i < nvme->io_queue_max; ++i, ++q_idx)
    {
        queue = nvme_alloc_queue(nvme, q_idx, nvme->queue_depth);

        if (rt_is_err(queue))
        {
            return rt_ptr_err(queue);
        }

        if ((err = nvme_attach_queue_cq(queue)) ||
            (err = nvme_attach_queue_sq(queue)))
        {
            return err;
        }
    }

    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
    {
        irq = nvme->irqs[irq_idx % nvme->irqs_nr];

        rt_snprintf(name, RT_NAME_MAX, "%s-io-queue%d", nvme->name, i);

        if (!affinity_fixup)
        {
            RT_IRQ_AFFINITY_SET(affinity, cpuid % RT_CPUS_NR);
            if (rt_pic_irq_set_affinity(irq, affinity))
            {
                /* Fixup in secondary CPU startup */
                affinity_fixup = RT_TRUE;
            }
            RT_IRQ_AFFINITY_CLEAR(affinity, cpuid++ % RT_CPUS_NR);
        }

        rt_hw_interrupt_install(irq, nvme_queue_isr, &nvme->io_queues[i], name);
        rt_hw_interrupt_umask(irq);
    }

    return RT_EOK;
}

static void nvme_remove_io_queues(struct rt_nvme_controller *nvme)
{
    int irq;
    struct rt_nvme_queue *queue;

    for (int i = 0, irq_idx = 1; i < nvme->io_queue_max; ++i, ++irq_idx)
    {
        queue = &nvme->io_queues[i];

        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_SQ);
        nvme_detach_queue(queue, RT_NVME_ADMIN_OPCODE_DELETE_CQ);
        nvme_free_queue(queue);

        irq = nvme->irqs[irq_idx % nvme->irqs_nr];

        rt_hw_interrupt_mask(irq);
        rt_pic_detach_irq(irq, queue);
    }
}

static void nvme_remove_admin_queues(struct rt_nvme_controller *nvme)
{
    int irq = nvme->irqs[0];

    rt_hw_interrupt_mask(irq);
    rt_pic_detach_irq(irq, &nvme->admin_queue);

    nvme_free_queue(&nvme->admin_queue);
}

static void nvme_remove_devices(struct rt_nvme_controller *nvme)
{
    struct rt_nvme_device *ndev, *next_ndev;

    rt_list_for_each_entry_safe(ndev, next_ndev, &nvme->ns_nodes, list)
    {
        rt_list_remove(&ndev->list);

        rt_hw_blk_disk_unregister(&ndev->parent);
        rt_free(ndev);
    }
}

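/*
 * Probe namespaces 1..number_of_ns with Identify (CNS 0), skip inactive ones
 * (NSZE == 0), and register each active namespace as a block disk named
 * "<controller>n<nsid>".
 */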
static rt_err_t nvme_scan_device(struct rt_nvme_controller *nvme,
        rt_size_t number_of_ns)
{
    rt_err_t err = RT_EOK;
    rt_uint32_t lbaf;
    struct rt_nvme_id_ns *id = RT_NULL;

    if (!(id = rt_malloc_align(sizeof(*id), nvme->page_size)))
    {
        return -RT_ENOMEM;
    }

    /* NVMe namespace IDs start at 1 */
    for (rt_uint32_t nsid = 1; nsid <= number_of_ns; ++nsid)
    {
        struct rt_nvme_device *ndev = rt_calloc(1, sizeof(*ndev));

        if (!ndev)
        {
            err = -RT_ENOMEM;
            goto _free_res;
        }

        rt_memset(id, 0, sizeof(*id));
        if ((err = nvme_identify(nvme, nsid, 0, id)))
        {
            rt_free(ndev);
            goto _free_res;
        }

        if (!id->nsze)
        {
            rt_free(ndev);
            continue;
        }

        ndev->ctrl = nvme;

        rt_memcpy(&ndev->id, id, sizeof(ndev->id));
        lbaf = id->flbas & RT_NVME_NS_FLBAS_LBA_MASK;
        lbaf |= ((id->flbas & RT_NVME_NS_FLBAS_LBA_UMASK) >> RT_NVME_NS_FLBAS_LBA_SHIFT);

        ndev->nsid = nsid;
        ndev->lba_shift = id->lbaf[lbaf].ds;

        ndev->parent.ida = &nvme_ida;
        ndev->parent.parallel_io = RT_TRUE;
        ndev->parent.ops = &nvme_blk_ops;
        ndev->parent.max_partitions = RT_BLK_PARTITION_MAX;
        rt_dm_dev_set_name(&ndev->parent.parent, "%sn%u", nvme->name, nsid);

        if ((err = rt_hw_blk_disk_register(&ndev->parent)))
        {
            rt_free(ndev);
            goto _free_res;
        }

        rt_list_init(&ndev->list);
        rt_list_insert_before(&nvme->ns_nodes, &ndev->list);
    }

_free_res:
    rt_free_align(id);

    return err;
}

rt_inline rt_size_t strip_len(const char *str, rt_size_t max_len)
{
    rt_size_t size = 0;

    for (int i = 0; *str && i < max_len; ++i, ++str)
    {
        if (*str != ' ')
        {
            size = i + 1;
        }
    }

    return size;
}

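/*
 * Register an NVMe controller with the block layer: read CAP, set up the
 * admin and I/O queues, identify the controller to learn its transfer limits
 * and optional features (volatile write cache, Write Zeroes, SGL support),
 * then scan and register its namespaces.
 */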
rt_err_t rt_nvme_controller_register(struct rt_nvme_controller *nvme)
{
    rt_err_t err;
    struct rt_nvme_id_ctrl *ctrl = RT_NULL;

    if (!nvme || !nvme->ops)
    {
        return -RT_EINVAL;
    }

    if (nvme_readl(nvme, RT_NVME_REG_CSTS) == (rt_uint32_t)-1)
    {
        LOG_E("Controller registers are not accessible (CSTS reads 0xffffffff)");

        return -RT_EINVAL;
    }

    if ((nvme->nvme_id = rt_dm_ida_alloc(&nvme_controller_ida)) < 0)
    {
        return -RT_EFULL;
    }

    rt_snprintf(nvme->name, RT_NAME_MAX, "nvme%u", nvme->nvme_id);

    nvme->cap = nvme_readq(nvme, RT_NVME_REG_CAP);
    nvme->queue_depth = RT_NVME_CAP_MQES(nvme->cap) + 1;
    nvme->doorbell_stride = 1 << RT_NVME_CAP_STRIDE(nvme->cap);
    nvme->doorbell_tbl = nvme->regs + RT_NVME_REG_DBS;

    if ((err = nvme_configure_admin_queue(nvme)))
    {
        LOG_E("Configure admin queue error = %s", rt_strerror(err));
        goto _free_admin_queue;
    }

    if ((err = nvme_setup_io_queues(nvme)))
    {
        LOG_E("Unable to setup I/O queues error = %s", rt_strerror(err));
        goto _free_admin_queue;
    }

    if (!(ctrl = rt_malloc_align(sizeof(*ctrl), nvme->page_size)))
    {
        err = -RT_ENOMEM;
        goto _fail;
    }

    if ((err = nvme_identify(nvme, 0, 1, ctrl)))
    {
        goto _fail;
    }

    if (ctrl->mdts)
    {
        nvme->max_transfer_shift = ctrl->mdts + (RT_NVME_CAP_MPSMIN(nvme->cap) + 12);
    }
    else
    {
        /* 1MB is recommended. */
        nvme->max_transfer_shift = 20;
    }
    nvme->volatile_write_cache = ctrl->vwc;
    nvme->write_zeroes = !!(rt_le64_to_cpu(ctrl->oncs) & RT_NVME_CTRL_ONCS_WRITE_ZEROES);

    if ((rt_le32_to_cpu(ctrl->sgls) & RT_NVME_ID_SGL_SUPPORT_MASK))
    {
        nvme->sgl_mode = RT_NVME_PSDT_SGL_MPTR_SGL;
    }

    LOG_I("NVM Express v%d.%d (%s, %.*s, %.*s)",
            nvme_readl(nvme, RT_NVME_REG_VS) >> 16,
            (nvme_readl(nvme, RT_NVME_REG_VS) >> 8) & 0xff,
            nvme->ops->name,
            (int)strip_len(ctrl->mn, sizeof(ctrl->mn)), ctrl->mn,
            (int)strip_len(ctrl->fr, sizeof(ctrl->fr)), ctrl->fr);

    rt_list_init(&nvme->ns_nodes);
    if ((err = nvme_scan_device(nvme, rt_le32_to_cpu(ctrl->nn))))
    {
        goto _fail;
    }

    rt_free_align(ctrl);

    rt_spin_lock(&nvme_lock);
    rt_list_insert_after(&nvme_nodes, &nvme->list);
    rt_spin_unlock(&nvme_lock);

    return RT_EOK;

_fail:
    if (ctrl)
    {
        rt_free_align(ctrl);
    }
    nvme_remove_devices(nvme);
    nvme_remove_io_queues(nvme);
_free_admin_queue:
    nvme_remove_admin_queues(nvme);

    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);

    return err;
}

rt_err_t rt_nvme_controller_unregister(struct rt_nvme_controller *nvme)
{
    rt_err_t err;

    if (!nvme)
    {
        return -RT_EINVAL;
    }

    rt_spin_lock(&nvme_lock);
    rt_list_remove(&nvme->list);
    rt_spin_unlock(&nvme_lock);

    nvme_remove_devices(nvme);
    nvme_remove_io_queues(nvme);
    nvme_remove_admin_queues(nvme);

    rt_dm_ida_free(&nvme_controller_ida, nvme->nvme_id);

    if (!(err = nvme_shutdown_ctrl(nvme)))
    {
        err = nvme_disable_ctrl(nvme);
    }
    else
    {
        LOG_E("%s: shutdown error = %s", nvme->name, rt_strerror(err));
    }

    return err;
}

/*
 * NVMe I/O queues should be per-CPU. Fix up the IRQ affinity after the
 * secondary CPUs start, since the setting is most likely to succeed at this
 * stage.
 */
static int nvme_queue_affinify_fixup(void)
{
    int cpuid = rt_hw_cpu_id();
    struct rt_nvme_controller *nvme;
    RT_IRQ_AFFINITY_DECLARE(affinity) = { 0 };
    RT_IRQ_AFFINITY_DECLARE(current_affinity) = { 0 };

    RT_IRQ_AFFINITY_SET(affinity, cpuid);

    rt_hw_spin_lock(&nvme_lock.lock);
    rt_list_for_each_entry(nvme, &nvme_nodes, list)
    {
        for (int i = cpuid % RT_CPUS_NR; i < nvme->io_queue_max; i += RT_CPUS_NR)
        {
            int irq = nvme->irqs[i];

            if (!rt_pic_irq_get_affinity(irq, current_affinity) &&
                !rt_bitmap_test_bit(current_affinity, cpuid))
            {
                rt_ubase_t level = rt_hw_interrupt_disable();

                rt_pic_irq_set_affinity(irq, affinity);

                rt_hw_interrupt_enable(level);
            }
        }
    }
    rt_hw_spin_unlock(&nvme_lock.lock);

    return 0;
}
INIT_SECONDARY_CPU_EXPORT(nvme_queue_affinify_fixup);