storage: v2 Add simple NVMe storage driver
This driver is a minimal port of the EFI NVMe driver.
It allows depthcharge to boot from an SSD supporting
the NVM Express interface (nvmexpress.org). See notes in
nvme.c for additional information on capabilities.
BACKGROUND:
Based on data from Flash Memory Summit 2014, client NVMe SSDs are
forecast to account for 60%+ of the market. They offer substantial
latency, bandwidth, and power advantages over SATA drives. According
to FMS articles, these drives will be available from several vendors,
including Samsung and SK Hynix.
A depthcharge driver allows IHVs to test/validate their platforms
earlier by using a common NVMe implementation. It also enables
developers who want to modify existing cros devices to use new storage
hardware.
USAGE:
There is a new CONFIG_DRIVER_STORAGE_NVME option in Kconfig
so one may include this driver at will.
Use this driver by adding CONFIG_DRIVER_STORAGE_NVME to your
.config and adding something like this to your board.c:
NvmeCtrlr *nvme = new_nvme_ctrlr(PCI_DEV(2, 0, 0));
list_insert_after(&nvme->ctrlr.list_node, &fixed_block_dev_controllers);
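For a fuller picture, a board.c sketch might look like the following
(the PCI location, the board_setup() hook, and the INIT_FUNC registration
are illustrative assumptions; adjust for your board):

#include "drivers/storage/nvme.h"

static int board_setup(void)
{
	/* Assumed example location: NVMe SSD at PCI 02:00.0 */
	NvmeCtrlr *nvme = new_nvme_ctrlr(PCI_DEV(2, 0, 0));
	list_insert_after(&nvme->ctrlr.list_node,
			  &fixed_block_dev_controllers);
	return 0;
}

INIT_FUNC(board_setup);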
BUG=none
BRANCH=none
TEST=compiles for panther and can boot from NVMe device w/ board.c change
Change-Id: Ic68aff1c800145c854b2c2ea5826164b2d6a0054
Signed-off-by: Jason B. Akers <jason.b.akers@intel.com>
Reviewed-on: https://chromium-review.googlesource.com/242308
Reviewed-by: Aaron Durbin <adurbin@chromium.org>
diff --git a/src/drivers/storage/Kconfig b/src/drivers/storage/Kconfig
index 39f8016..380e882 100644
--- a/src/drivers/storage/Kconfig
+++ b/src/drivers/storage/Kconfig
@@ -69,4 +69,8 @@
bool "Look up a kernel partition from a GPT on SPI"
default n
+config DRIVER_STORAGE_NVME
+ bool "NVMe driver"
+ default n
+
source src/drivers/storage/mtd/Kconfig
diff --git a/src/drivers/storage/Makefile.inc b/src/drivers/storage/Makefile.inc
index ca68b92..d304199 100644
--- a/src/drivers/storage/Makefile.inc
+++ b/src/drivers/storage/Makefile.inc
@@ -27,4 +27,5 @@
depthcharge-$(CONFIG_DRIVER_SDHCI) += sdhci.c mem_sdhci.c
depthcharge-$(CONFIG_DRIVER_STORAGE_SDHCI_PCI) += pci_sdhci.c
depthcharge-$(CONFIG_DRIVER_STORAGE_SPI_GPT) += spi_gpt.c
+depthcharge-$(CONFIG_DRIVER_STORAGE_NVME) += nvme.c
subdirs-y += mtd
diff --git a/src/drivers/storage/nvme.c b/src/drivers/storage/nvme.c
new file mode 100644
index 0000000..41bb862
--- /dev/null
+++ b/src/drivers/storage/nvme.c
@@ -0,0 +1,942 @@
+/*
+ * NVMe storage driver for depthcharge
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+/* Documentation:
+ * This driver implements a minimal subset of the NVMe 1.0e specification
+ * (nvmexpress.org). It is designed to balance simplicity and performance,
+ * so it polls the NVMe Completion Queue (CQ) for phase changes rather than
+ * using interrupts. Initialization commands are processed one at a time,
+ * so the Admin queue pair only needs a depth of 2.
+ * The driver is limited to a single IO queue pair (in addition to the
+ * mandatory Admin queue pair). The IO queue depth is configurable, but
+ * defaults to a shallow value to minimize host memory consumption. The
+ * driver supports at most one PRP List per command, limiting the maximum
+ * transfer size to 2MB (assuming 4KB memory pages).
+ *
+ * Operation:
+ * At initialization, this driver allocates a pool of host memory and overlays
+ * the queue pair structures on it. It also statically allocates memory for
+ * the PRP Lists, avoiding the need to allocate/free memory at IO time.
+ * Each identified NVMe namespace has a corresponding depthcharge BlockDev
+ * structure, effectively creating a new "drive" visible to higher levels.
+ *
+ * The depthcharge read/write callbacks split host requests into chunks that
+ * satisfy the NVMe device's maximum transfer size, then call the
+ * corresponding _internal_ functions to format the NVMe command structures
+ * in host memory. After all of the commands have been created in host
+ * memory, the Submission Queue tail pointer is updated, allowing the drive
+ * to process the newly submitted commands. Queuing commands lets the drive
+ * internally optimize accesses, increasing performance. Finally, the
+ * Completion Queue phase bit is polled until it inverts, indicating that the
+ * commands have completed. If the SQ is full, outstanding commands are
+ * completed before the _internal_ function proceeds. This reduces effective
+ * performance and should be avoided by increasing SQ/CQ depth.
+ */
+
+#include <assert.h>
+#include <endian.h>
+#include <libpayload.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "base/cleanup_funcs.h"
+#include "drivers/storage/blockdev.h"
+#include "drivers/storage/nvme.h"
+
+/* Read 64bits from register space */
+static uint64_t readll(uintptr_t _a)
+{
+ uint64_t _v;
+ uint32_t *v = (uint32_t *)&_v;
+
+ v[0] = readl(_a);
+ v[1] = readl(_a + sizeof(uint32_t));
+ return le64toh(_v);
+}
+
+/* Write 64bits to register space */
+static void writell(uint64_t _v, uintptr_t _a)
+{
+ uint32_t *v = (uint32_t *)&_v;
+
+ _v = htole64(_v);
+ writel(v[0], _a);
+ writel(v[1], _a + sizeof(uint32_t));
+}
+
+DEBUG(
+static void nvme_dump_status(NVME_CQ volatile *cq) {
+ printf("Dump NVMe Completion Entry Status from [%p]:\n", cq);
+
+ printf(" SQ ID : [0x%x], Phase Tag : [%d], Cmd ID : [0x%x] Flags : [0x%x]\n",
+ cq->sqid, cq->flags & NVME_CQ_FLAGS_PHASE, cq->cid, cq->flags);
+
+ if (NVME_CQ_FLAGS_SCT(cq->flags) == 0) {
+ if (NVME_CQ_FLAGS_SC(cq->flags) == 0)
+ printf(" NVMe Cmd Execution Result - Successful\n");
+ else
+ printf(" NVMe Cmd Execution Result - error sc=%u\n",NVME_CQ_FLAGS_SC(cq->flags));
+ } else
+ printf(" NVMe Cmd Execution Result - error sct=%u\n",NVME_CQ_FLAGS_SCT(cq->flags));
+}
+) //DEBUG
+
+/* Disables and resets the NVMe controller */
+static NVME_STATUS nvme_disable_controller(NvmeCtrlr *ctrlr) {
+ NVME_CC cc;
+ uint8_t timeout;
+
+ /* Read controller configuration */
+ cc = readl(ctrlr->ctrlr_regs + NVME_CC_OFFSET);
+ CLR(cc, NVME_CC_EN);
+ /* Write controller configuration */
+ writel_with_flush(cc, ctrlr->ctrlr_regs + NVME_CC_OFFSET);
+	/* Delay up to CAP.TO ms for CSTS.RDY to clear */
+ if (NVME_CAP_TO(ctrlr->cap) == 0)
+ timeout = 1;
+ else
+ timeout = NVME_CAP_TO(ctrlr->cap);
+
+ if (WAIT_WHILE(
+ ((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 1),
+ timeout)) {
+ return NVME_TIMEOUT;
+ }
+
+ return NVME_SUCCESS;
+}
+
+/* Enables controller and verifies that it's ready */
+static NVME_STATUS nvme_enable_controller(NvmeCtrlr *ctrlr) {
+ NVME_CC cc = 0;
+ uint8_t timeout;
+
+ SET(cc, NVME_CC_EN);
+ cc |= NVME_CC_IOSQES(6); /* Spec. recommended values */
+ cc |= NVME_CC_IOCQES(4); /* Spec. recommended values */
+ /* Write controller configuration. */
+ writel_with_flush(cc, ctrlr->ctrlr_regs + NVME_CC_OFFSET);
+
+	/* Delay up to CAP.TO ms for CSTS.RDY to set */
+ if (NVME_CAP_TO(ctrlr->cap) == 0)
+ timeout = 1;
+ else
+ timeout = NVME_CAP_TO(ctrlr->cap);
+
+ if (WAIT_WHILE(
+ ((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 0),
+ timeout)) {
+ return NVME_TIMEOUT;
+ }
+
+ return NVME_SUCCESS;
+}
+
+/* Submit and complete 1 command by polling CQ for phase change
+ * Rings SQ doorbell, polls waiting for completion, rings CQ doorbell
+ *
+ * ctrlr: NVMe controller handle
+ * qid: Queue Identifier for the SQ/CQ containing the new command
+ * sqsize: Number of commands (size) of the submission queue
+ * cqsize: Number of commands (size) of the completion queue
+ * timeout_ms: How long in milliseconds to wait for command completion
+ */
+static NVME_STATUS nvme_submit_cmd_polled(NvmeCtrlr *ctrlr,
+ uint16_t qid,
+ uint32_t sqsize,
+ uint32_t cqsize,
+ uint32_t timeout_ms) {
+ NVME_CQ *cq;
+
+ if (NULL == ctrlr)
+ return NVME_INVALID_PARAMETER;
+ if (qid > (NVME_NUM_QUEUES - 1))
+ return NVME_INVALID_PARAMETER;
+ if (timeout_ms == 0)
+ timeout_ms = 1;
+
+ cq = ctrlr->cq_buffer[qid] + ctrlr->cq_h_dbl[qid];
+
+ /* Update SQ tail index in host memory */
+ if (++(ctrlr->sq_t_dbl[qid]) > (sqsize-1))
+ ctrlr->sq_t_dbl[qid] = 0;
+
+ /* Ring the submission queue doorbell */
+ writel_with_flush(ctrlr->sq_t_dbl[qid],
+ ctrlr->ctrlr_regs +
+ NVME_SQTDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
+
+ /* Wait for phase to change (or timeout) */
+ if (WAIT_WHILE(
+ ((readw(&(cq->flags)) & NVME_CQ_FLAGS_PHASE) == ctrlr->pt[qid]),
+ timeout_ms)) {
+ printf("nvme_submit_cmd_polled: ERROR - timeout\n");
+ return NVME_TIMEOUT;
+ }
+
+ /* Dump completion entry status for debugging. */
+ DEBUG(nvme_dump_status(cq);)
+
+ /* Update the completion queue head index, queue phase if necessary */
+ if (++(ctrlr->cq_h_dbl[qid]) > (cqsize-1)) {
+ ctrlr->cq_h_dbl[qid] = 0;
+ ctrlr->pt[qid] ^= 1;
+ }
+ /* Update SQ head pointer */
+ ctrlr->sqhd[qid] = cq->sqhd;
+
+ /* Ring the CQ doorbell */
+ writel_with_flush(ctrlr->cq_h_dbl[qid],
+ ctrlr->ctrlr_regs +
+ NVME_CQHDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
+
+ return NVME_SUCCESS;
+}
+
+/* Add command to host SQ, don't write to HW SQ yet
+ *
+ * ctrlr: NVMe controller handle
+ * qid: Queue Identifier for the SQ/CQ containing the new command
+ * sqsize: Size of the submission queue
+ */
+static NVME_STATUS nvme_submit_cmd(NvmeCtrlr *ctrlr, uint16_t qid, uint32_t sqsize) {
+ if (NULL == ctrlr)
+ return NVME_INVALID_PARAMETER;
+ if (qid > (NVME_NUM_QUEUES - 1))
+ return NVME_INVALID_PARAMETER;
+
+ /* Update the submission queue tail in host memory */
+ if (++(ctrlr->sq_t_dbl[qid]) > (sqsize-1))
+ ctrlr->sq_t_dbl[qid] = 0;
+
+ return NVME_SUCCESS;
+}
+
+/* Ring SQ doorbell register, submitting all outstanding commands to HW
+ *
+ * ctrlr: NVMe controller handle
+ * qid: Queue Identifier for the SQ/CQ containing the new command
+ */
+static NVME_STATUS nvme_ring_sq_doorbell(NvmeCtrlr *ctrlr, uint16_t qid) {
+ if (NULL == ctrlr)
+ return NVME_INVALID_PARAMETER;
+ if (qid > (NVME_NUM_QUEUES - 1))
+ return NVME_INVALID_PARAMETER;
+
+ /* Ring SQ doorbell by writing SQ tail index to controller */
+ writel_with_flush(ctrlr->sq_t_dbl[qid],
+ ctrlr->ctrlr_regs +
+ NVME_SQTDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
+
+ return NVME_SUCCESS;
+}
+
+/* Poll for completion of all commands from HW
+ *
+ * ctrlr: NVMe controller handle
+ * qid: Queue Identifier for the SQ/CQ containing the new command
+ * cqsize: Size of the completion queue
+ * timeout_ms: How long in milliseconds to wait for command completion
+ */
+static NVME_STATUS nvme_complete_cmds_polled(NvmeCtrlr *ctrlr,
+ uint16_t qid,
+ uint32_t cqsize,
+ uint32_t timeout_ms) {
+ NVME_CQ *cq;
+ uint32_t ncmds;
+
+ if (NULL == ctrlr)
+ return NVME_INVALID_PARAMETER;
+ if (qid > (NVME_NUM_QUEUES - 1))
+ return NVME_INVALID_PARAMETER;
+ if (timeout_ms == 0)
+ timeout_ms = 1;
+
+ /* We will complete all outstanding commands */
+ if (ctrlr->cq_h_dbl[qid] < ctrlr->sq_t_dbl[qid])
+ ncmds = ctrlr->sq_t_dbl[qid] - ctrlr->cq_h_dbl[qid];
+ else
+ ncmds = (cqsize - ctrlr->cq_h_dbl[qid]) + ctrlr->sq_t_dbl[qid];
+ DEBUG(printf("nvme_complete_cmds_polled: completing %u commands\n",(unsigned)ncmds);)
+
+ while (ncmds--) {
+ cq = ctrlr->cq_buffer[qid] + ctrlr->cq_h_dbl[qid];
+ /* Wait for phase to change (or timeout) */
+ if (WAIT_WHILE(
+ ((readw(&(cq->flags)) & NVME_CQ_FLAGS_PHASE) == ctrlr->pt[qid]),
+ timeout_ms)) {
+ printf("nvme_complete_cmds_polled: ERROR - timeout\n");
+ return NVME_TIMEOUT;
+ }
+
+ /* Dump completion entry status for debugging. */
+ DEBUG(nvme_dump_status(cq);)
+
+ /* Update the doorbell, queue phase, and queue command id if necessary */
+ if (++(ctrlr->cq_h_dbl[qid]) > (cqsize-1)) {
+ ctrlr->cq_h_dbl[qid] = 0;
+ ctrlr->pt[qid] ^= 1;
+ ctrlr->cid[qid] = 0;
+ }
+ /* Update SQ head pointer */
+ ctrlr->sqhd[qid] = cq->sqhd;
+ }
+
+	/* Ring the completion queue doorbell register */
+ writel_with_flush(ctrlr->cq_h_dbl[qid], ctrlr->ctrlr_regs + NVME_CQHDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
+
+ return NVME_SUCCESS;
+}
+
+/* Creates a single IO completion queue */
+static NVME_STATUS nvme_create_cq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
+ NVME_SQ *sq;
+ int status = NVME_SUCCESS;
+
+ sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_ADMIN_CRIOCQ_OPC;
+ sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
+
+	/* Only physically contiguous addresses are supported */
+ sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->cq_buffer[qid]);
+ /* Set physically contiguous (PC) bit */
+ sq->cdw11 = 1;
+
+ sq->cdw10 |= NVME_ADMIN_CRIOCQ_QID(qid);
+ sq->cdw10 |= NVME_ADMIN_CRIOCQ_QSIZE(qsize);
+
+ status = nvme_submit_cmd_polled(ctrlr,
+ NVME_ADMIN_QUEUE_INDEX,
+ NVME_ASQ_SIZE,
+ NVME_ACQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+
+ return status;
+}
+
+/* Creates a single IO submission queue
+ * NOTE: Assumes that completion queue ID == submission queue ID
+ */
+static NVME_STATUS nvme_create_sq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
+ NVME_SQ *sq;
+ int status = NVME_SUCCESS;
+
+ sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_ADMIN_CRIOSQ_OPC;
+ sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
+
+	/* Only physically contiguous addresses are supported */
+ sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->sq_buffer[qid]);
+ /* Set physically contiguous (PC) bit */
+ sq->cdw11 = 1;
+ sq->cdw11 |= NVME_ADMIN_CRIOSQ_CQID(qid);
+
+ sq->cdw10 |= NVME_ADMIN_CRIOSQ_QID(qid);
+ sq->cdw10 |= NVME_ADMIN_CRIOSQ_QSIZE(qsize);
+
+ status = nvme_submit_cmd_polled(ctrlr,
+ NVME_ADMIN_QUEUE_INDEX,
+ NVME_ASQ_SIZE,
+ NVME_ACQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+
+ return status;
+}
+
+/* Generate PRPs for a single virtual memory buffer
+ * prp_list: pre-allocated prp list buffer
+ * prp: pointer to SQ PRP array
+ * buffer: host buffer for request
+ * size: number of bytes in request
+ */
+static NVME_STATUS nvme_fill_prp(PrpList *prp_list, uint64_t *prp, void *buffer, uint64_t size)
+{
+ uint64_t offset = (uintptr_t)buffer & (NVME_PAGE_SIZE - 1);
+ uint64_t xfer_pages;
+ uintptr_t buffer_phys = virt_to_phys(buffer);
+
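+	/*
+	 * NVMe PRP rules: PRP entry 1 covers from the (possibly unaligned)
+	 * buffer start to the end of its page; PRP entry 2 is either the
+	 * second page (for transfers spanning at most two pages) or the
+	 * physical address of a PRP List whose entries each point to one page.
+	 */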
+ /* PRP0 is always the (potentially unaligned) start of the buffer */
+ prp[0] = buffer_phys;
+ /* Increment buffer to the next aligned page */
+ if (ALIGN(buffer_phys,NVME_PAGE_SIZE) == buffer_phys)
+ buffer_phys += NVME_PAGE_SIZE;
+ else
+ buffer_phys = ALIGN_UP(buffer_phys,NVME_PAGE_SIZE);
+
+ /* Case 1: all data will fit in 2 PRP entries (accounting for buffer offset) */
+ if ((size + offset) <= (2 * NVME_PAGE_SIZE)) {
+ prp[1] = buffer_phys;
+ return NVME_SUCCESS;
+ }
+
+ /* Case 2: Need to build up to one PRP List */
+ xfer_pages = (ALIGN((size + offset), NVME_PAGE_SIZE) >> NVME_PAGE_SHIFT);
+ /* Don't count first prp entry as it is the beginning of buffer */
+ xfer_pages--;
+ /* Make sure this transfer fits into one PRP list */
+ if (xfer_pages > (NVME_MAX_XFER_BYTES/NVME_PAGE_SIZE))
+ return NVME_INVALID_PARAMETER;
+
+ /* Fill the PRP List */
+ prp[1] = (uintptr_t)virt_to_phys(prp_list);
+ for (uint32_t entry_index = 0; entry_index < xfer_pages; entry_index++) {
+ prp_list->prp_entry[entry_index] = buffer_phys;
+ buffer_phys += NVME_PAGE_SIZE;
+ }
+ return NVME_SUCCESS;
+}
+
+/* Sets up read operation for up to max_transfer blocks */
+static NVME_STATUS nvme_internal_read(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
+{
+ NvmeCtrlr *ctrlr = drive->ctrlr;
+ NVME_SQ *sq;
+ int status = NVME_SUCCESS;
+
+ if (count == 0)
+ return NVME_INVALID_PARAMETER;
+
+ /* If queue is full, need to complete inflight commands before submitting more */
+ if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
+ DEBUG(printf("nvme_internal_read: Too many outstanding commands. Completing in-flights\n");)
+ /* Submit commands to controller */
+ nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
+ /* Complete submitted command(s) */
+ status = nvme_complete_cmds_polled(ctrlr,
+ NVME_IO_QUEUE_INDEX,
+ NVME_CCQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+ if (NVME_ERROR(status)) {
+ printf("nvme_internal_read: error %d completing outstanding commands\n",status);
+ return status;
+ }
+ }
+
+ sq = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_IO_READ_OPC;
+ sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
+ sq->nsid = drive->namespace_id;
+
+ status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
+ if (NVME_ERROR(status)) {
+ printf("nvme_internal_read: error %d generating PRP(s)\n",status);
+ return status;
+ }
+
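+	/* CDW10/CDW11 hold the 64-bit starting LBA; CDW12 bits 15:0 hold the
+	 * number of logical blocks as a 0-based value */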
+ sq->cdw10 = start;
+ sq->cdw11 = (start >> 32);
+ sq->cdw12 = (count - 1) & 0xFFFF;
+
+ status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
+
+ return status;
+}
+
+/* Read operation entrypoint
+ * Cut operation into max_transfer chunks and do it
+ */
+static lba_t nvme_read(BlockDevOps *me, lba_t start, lba_t count, void *buffer)
+{
+ NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
+ NvmeCtrlr *ctrlr = drive->ctrlr;
+ uint64_t max_transfer_blocks = 0;
+ uint32_t block_size = drive->dev.block_size;
+ lba_t orig_count = count;
+ int status = NVME_SUCCESS;
+
+ DEBUG(printf("nvme_read: Reading from namespace %d\n",drive->namespace_id);)
+
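+	/* MDTS limits transfers to 2^MDTS units of the minimum memory page
+	 * size (CAP.MPSMIN); convert that byte limit to logical blocks */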
+ if (ctrlr->controller_data->mdts != 0)
+ max_transfer_blocks = ((1 << (ctrlr->controller_data->mdts)) * (1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
+ /* Artificially limit max_transfer_blocks to 1 PRP List */
+	if ((max_transfer_blocks == 0) ||
+	    (max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
+ max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;
+
+ while (count > 0) {
+ if (count > max_transfer_blocks) {
+ DEBUG(printf("nvme_read: partial read of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
+ status = nvme_internal_read(drive, buffer, start, max_transfer_blocks);
+ count -= max_transfer_blocks;
+ buffer += max_transfer_blocks*block_size;
+ start += max_transfer_blocks;
+ } else {
+ DEBUG(printf("nvme_read: final read of %llu blocks\n",(unsigned long long)count);)
+ status = nvme_internal_read(drive, buffer, start, count);
+ count = 0;
+ }
+ if (NVME_ERROR(status))
+ break;
+ }
+
+ if (NVME_ERROR(status)) {
+ printf("nvme_read: error %d\n",status);
+ return -1;
+ }
+
+ /* Submit commands to controller */
+ nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
+ /* Complete submitted command(s) */
+ nvme_complete_cmds_polled(ctrlr,
+ NVME_IO_QUEUE_INDEX,
+ NVME_CCQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+
+ DEBUG(printf("nvme_read: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)
+
+ return orig_count - count;
+}
+
+/* Sets up write operation for up to max_transfer blocks */
+static NVME_STATUS nvme_internal_write(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
+{
+ NvmeCtrlr *ctrlr = drive->ctrlr;
+ NVME_SQ *sq;
+ int status = NVME_SUCCESS;
+
+ if (count == 0)
+ return NVME_INVALID_PARAMETER;
+
+ /* If queue is full, need to complete inflight commands before submitting more */
+ if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
+ DEBUG(printf("nvme_internal_write: Too many outstanding commands. Completing in-flights\n");)
+ /* Submit commands to controller */
+ nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
+ /* Complete submitted command(s) */
+ status = nvme_complete_cmds_polled(ctrlr,
+ NVME_IO_QUEUE_INDEX,
+ NVME_CCQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+ if (NVME_ERROR(status)) {
+			printf("nvme_internal_write: error %d completing outstanding commands\n",status);
+ return status;
+ }
+ }
+
+ sq = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_IO_WRITE_OPC;
+ sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
+ sq->nsid = drive->namespace_id;
+
+ status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
+ if (NVME_ERROR(status)) {
+ printf("nvme_internal_write: error %d generating PRP(s)\n",status);
+ return status;
+ }
+
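+	/* Starting LBA in CDW10/11; 0-based block count in CDW12 bits 15:0 */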
+ sq->cdw10 = start;
+ sq->cdw11 = (start >> 32);
+ sq->cdw12 = (count - 1) & 0xFFFF;
+
+ status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
+
+ return status;
+}
+
+/* Write operation entrypoint
+ * Cut operation into max_transfer chunks and do it
+ */
+static lba_t nvme_write(BlockDevOps *me, lba_t start, lba_t count,
+ const void *buffer)
+{
+ NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
+ NvmeCtrlr *ctrlr = drive->ctrlr;
+ uint64_t max_transfer_blocks = 0;
+ uint32_t block_size = drive->dev.block_size;
+ lba_t orig_count = count;
+ int status = NVME_SUCCESS;
+
+ DEBUG(printf("nvme_write: Writing to namespace %d\n",drive->namespace_id);)
+
+ if (ctrlr->controller_data->mdts != 0)
+ max_transfer_blocks = ((1 << (ctrlr->controller_data->mdts)) * (1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
+ /* Artificially limit max_transfer_blocks to 1 PRP List */
+	if ((max_transfer_blocks == 0) ||
+	    (max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
+ max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;
+
+ while (count > 0) {
+ if (count > max_transfer_blocks) {
+ DEBUG(printf("nvme_write: partial write of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
+ status = nvme_internal_write(drive, (void *)buffer, start, max_transfer_blocks);
+ count -= max_transfer_blocks;
+ buffer += max_transfer_blocks*block_size;
+ start += max_transfer_blocks;
+ } else {
+			DEBUG(printf("nvme_write: final write of %llu blocks\n",(unsigned long long)count);)
+ status = nvme_internal_write(drive, (void *)buffer, start, count);
+ count = 0;
+ }
+ if (NVME_ERROR(status))
+ break;
+ }
+
+ if (NVME_ERROR(status)) {
+ printf("nvme_write: error %d\n",status);
+ return -1;
+ }
+
+ /* Submit commands to controller */
+ nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
+ /* Complete submitted command(s) */
+ nvme_complete_cmds_polled(ctrlr,
+ NVME_IO_QUEUE_INDEX,
+ NVME_CCQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+
+ DEBUG(printf("nvme_write: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)
+
+ return orig_count - count;
+}
+
+/* Sends the Identify command, saves result in ctrlr->controller_data */
+static NVME_STATUS nvme_identify(NvmeCtrlr *ctrlr) {
+ NVME_SQ *sq;
+ int status = NVME_SUCCESS;
+
+ ctrlr->controller_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_CONTROLLER_DATA));
+ if (ctrlr->controller_data == NULL) {
+ printf("nvme_identify: ERROR - out of memory\n");
+ return NVME_OUT_OF_RESOURCES;
+ }
+
+ sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_ADMIN_IDENTIFY_OPC;
+ sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
+
+	/* Identify structure is 4KB in size; fits in 1 aligned page */
+ sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->controller_data);
+ /* Set bit 0 (Cns bit) to 1 to identify a controller */
+ sq->cdw10 = 1;
+
+ status = nvme_submit_cmd_polled(ctrlr,
+ NVME_ADMIN_QUEUE_INDEX,
+ NVME_ASQ_SIZE,
+ NVME_ACQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+ if (NVME_ERROR(status))
+ return status;
+
+ ctrlr->controller_data->sn[19] = 0;
+ ctrlr->controller_data->mn[39] = 0;
+ DEBUG(printf(" == NVME IDENTIFY CONTROLLER DATA ==\n");)
+ DEBUG(printf(" PCI VID : 0x%x\n", ctrlr->controller_data->vid);)
+ DEBUG(printf(" PCI SSVID : 0x%x\n", ctrlr->controller_data->ssvid);)
+ DEBUG(printf(" SN : %s\n", (char *)(ctrlr->controller_data->sn));)
+ DEBUG(printf(" MN : %s\n", (char *)(ctrlr->controller_data->mn));)
+ DEBUG(printf(" RAB : 0x%x\n", ctrlr->controller_data->rab);)
+ DEBUG(printf(" AERL : 0x%x\n", ctrlr->controller_data->aerl);)
+ DEBUG(printf(" SQES : 0x%x\n", ctrlr->controller_data->sqes);)
+ DEBUG(printf(" CQES : 0x%x\n", ctrlr->controller_data->cqes);)
+ DEBUG(printf(" NN : 0x%x\n", ctrlr->controller_data->nn);)
+
+ return status;
+}
+
+/* Sends the Identify Namespace command, creates NvmeDrives for each namespace */
+static NVME_STATUS nvme_identify_namespaces(NvmeCtrlr *ctrlr) {
+ NVME_SQ *sq;
+ NVME_ADMIN_NAMESPACE_DATA *namespace_data = NULL;
+ int status = NVME_SUCCESS;
+
+ if (ctrlr->controller_data == NULL) {
+ printf("nvme_identify_namespaces: ERROR - must complete Identify command first\n");
+ return NVME_INVALID_PARAMETER;
+ }
+
+ namespace_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_NAMESPACE_DATA));
+ if (namespace_data == NULL) {
+ printf("nvme_identify_namespaces: ERROR - out of memory\n");
+ return NVME_OUT_OF_RESOURCES;
+ }
+
+ for (uint32_t index = 1; index <= ctrlr->controller_data->nn; index++) {
+ DEBUG(printf("nvme_identify_namespaces: Working on namespace %d\n",index);)
+
+ sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
+
+ memset(sq, 0, sizeof(NVME_SQ));
+
+ sq->opc = NVME_ADMIN_IDENTIFY_OPC;
+ sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
+ sq->nsid = index;
+
+		/* Identify structure is 4KB in size; fits in 1 aligned page */
+ sq->prp[0] = (uintptr_t)virt_to_phys(namespace_data);
+ /* Clear bit 0 (Cns bit) to identify a namespace */
+
+ status = nvme_submit_cmd_polled(ctrlr,
+ NVME_ADMIN_QUEUE_INDEX,
+ NVME_ASQ_SIZE,
+ NVME_ACQ_SIZE,
+ NVME_GENERIC_TIMEOUT);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ DEBUG(printf(" == NVME IDENTIFY NAMESPACE [%d] DATA ==\n", index);)
+ DEBUG(printf(" NSZE : 0x%llx\n", namespace_data->nsze);)
+ DEBUG(printf(" NCAP : 0x%llx\n", namespace_data->ncap);)
+ DEBUG(printf(" NUSE : 0x%llx\n", namespace_data->nuse);)
+ DEBUG(printf(" LBAF0.LBADS : 0x%x\n", (namespace_data->lba_format[0].lbads));)
+
+ if (namespace_data->ncap == 0) {
+ printf("nvme_identify_namespaces: ERROR - namespace %d has zero capacity\n", index);
+ status = NVME_DEVICE_ERROR;
+ goto exit;
+ } else {
+ /* Create drive node. */
+ NvmeDrive *nvme_drive = xzalloc(sizeof(*nvme_drive));
+ static const int name_size = 21;
+ char *name = xmalloc(name_size);
+ snprintf(name, name_size, "NVMe Namespace %d", index);
+ nvme_drive->dev.ops.read = &nvme_read;
+ nvme_drive->dev.ops.write = &nvme_write;
+ nvme_drive->dev.ops.new_stream = &new_simple_stream;
+ nvme_drive->dev.name = name;
+ nvme_drive->dev.removable = 0;
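+			/* LBADS is log2 of the LBA data size in bytes */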
+ nvme_drive->dev.block_size = 2 << (namespace_data->lba_format[namespace_data->flbas & 0xF].lbads - 1);
+ nvme_drive->dev.block_count = namespace_data->nsze;
+ nvme_drive->ctrlr = ctrlr;
+ nvme_drive->namespace_id = index;
+ list_insert_after(&nvme_drive->dev.list_node,
+ &fixed_block_devices);
+ list_insert_after(&nvme_drive->list_node, &ctrlr->drives);
+ printf("Added NVMe drive \"%s\" lbasize:%d, count:0x%llx\n", nvme_drive->dev.name, nvme_drive->dev.block_size, (uint64_t)nvme_drive->dev.block_count);
+ }
+ }
+
+exit:
+ if (namespace_data != NULL)
+ free(namespace_data);
+
+ return status;
+}
+
+/* Initialization entrypoint */
+static int nvme_ctrlr_init(BlockDevCtrlrOps *me)
+{
+ NvmeCtrlr *ctrlr = container_of(me, NvmeCtrlr, ctrlr.ops);
+ pcidev_t dev = ctrlr->dev;
+ int status = NVME_SUCCESS;
+
+ if ((pci_read_config8(ctrlr->dev, REG_PROG_IF) != PCI_IF_NVMHCI)
+ || (pci_read_config8(ctrlr->dev, REG_SUBCLASS) != PCI_CLASS_MASS_STORAGE_NVM)
+ || (pci_read_config8(ctrlr->dev, REG_CLASS) != PCI_CLASS_MASS_STORAGE)) {
+ printf("Unsupported NVMe controller found\n");
+ status = NVME_UNSUPPORTED;
+ goto exit;
+ }
+
+ printf("Initializing NVMe controller %04x:%04x\n",
+ pci_read_config16(ctrlr->dev, REG_VENDOR_ID),
+ pci_read_config16(ctrlr->dev, REG_DEVICE_ID));
+
+ pci_set_bus_master(dev);
+
+ /* Read the Controller Capabilities register */
+ ctrlr->ctrlr_regs = pci_read_resource(dev,0);
+ ctrlr->ctrlr_regs = ctrlr->ctrlr_regs & ~0x7;
+ ctrlr->cap = readll(ctrlr->ctrlr_regs + NVME_CAP_OFFSET);
+
+ /* Verify that the NVM command set is supported */
+ if (NVME_CAP_CSS(ctrlr->cap) != NVME_CAP_CSS_NVM) {
+		printf("NVMe Cap CSS not NVMe (CSS=%01x). Unsupported controller.\n",(uint8_t)NVME_CAP_CSS(ctrlr->cap));
+ status = NVME_UNSUPPORTED;
+ goto exit;
+ }
+
+ /* Driver only supports 4k page size */
+ if (NVME_CAP_MPSMIN(ctrlr->cap) > NVME_PAGE_SHIFT) {
+ printf("NVMe driver only supports 4k page size. Unsupported controller.\n");
+ status = NVME_UNSUPPORTED;
+ goto exit;
+ }
+
+ /* Calculate max io sq/cq sizes based on MQES */
+ ctrlr->iosq_sz = (NVME_CSQ_SIZE > NVME_CAP_MQES(ctrlr->cap)) ? NVME_CAP_MQES(ctrlr->cap) : NVME_CSQ_SIZE;
+ ctrlr->iocq_sz = (NVME_CCQ_SIZE > NVME_CAP_MQES(ctrlr->cap)) ? NVME_CAP_MQES(ctrlr->cap) : NVME_CCQ_SIZE;
+ DEBUG(printf("iosq_sz = %u, iocq_sz = %u\n",ctrlr->iosq_sz,ctrlr->iocq_sz);)
+
+ /* Allocate enough PRP List memory for max queue depth commands */
+ for (unsigned int list_index = 0; list_index < ctrlr->iosq_sz; list_index++) {
+ ctrlr->prp_list[list_index] = dma_memalign(NVME_PAGE_SIZE, NVME_PAGE_SIZE);
+ if (!(ctrlr->prp_list[list_index])) {
+ printf("NVMe driver failed to allocate prp list %u memory\n",list_index);
+ status = NVME_OUT_OF_RESOURCES;
+ goto exit;
+ }
+ memset(ctrlr->prp_list[list_index], 0, NVME_PAGE_SIZE);
+ }
+
+	/* Allocate queue memory block: one page each for the Admin SQ/CQ and IO SQ/CQ */
+ ctrlr->buffer = dma_memalign(NVME_PAGE_SIZE, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);
+ if (!(ctrlr->buffer)) {
+ printf("NVMe driver failed to allocate queue buffer\n");
+ status = NVME_OUT_OF_RESOURCES;
+ goto exit;
+ }
+ memset(ctrlr->buffer, 0, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);
+
+ /* Disable controller */
+ status = nvme_disable_controller(ctrlr);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ /* Create Admin queue pair */
+ NVME_AQA aqa = 0;
+ NVME_ASQ asq = 0;
+ NVME_ACQ acq = 0;
+
+ /* Verify defined queue sizes are within NVME_PAGE_SIZE limits */
+ #if NVME_ASQ_SIZE != 2
+ #error "Unsupported Admin SQ size defined"
+ #endif
+ #if NVME_ACQ_SIZE != 2
+ #error "Unsupported Admin CQ size defined"
+ #endif
+ #if (NVME_CSQ_SIZE < 2) || (NVME_CSQ_SIZE > (NVME_PAGE_SIZE / 64))
+ #error "Unsupported IO SQ size defined"
+ #endif
+ #if (NVME_CCQ_SIZE < 2) || (NVME_CCQ_SIZE > (NVME_PAGE_SIZE / 64))
+ #error "Unsupported IO CQ size defined"
+ #endif
+
+ /* Set number of entries Admin submission & completion queues. */
+ aqa |= NVME_AQA_ASQS(NVME_ASQ_SIZE);
+ aqa |= NVME_AQA_ACQS(NVME_ACQ_SIZE);
+ /* Address of Admin submission queue. */
+ asq = (uintptr_t)virt_to_phys(ctrlr->buffer);
+ ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_SQ *)ctrlr->buffer;
+ /* Address of Admin completion queue. */
+ acq = (uintptr_t)virt_to_phys(ctrlr->buffer + NVME_PAGE_SIZE);
+ ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_CQ *)(ctrlr->buffer + NVME_PAGE_SIZE);
+ /* Address of I/O submission & completion queues */
+ ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] =
+ (NVME_SQ *)(ctrlr->buffer + 2 * NVME_PAGE_SIZE);
+ ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX] =
+ (NVME_CQ *)(ctrlr->buffer + 3 * NVME_PAGE_SIZE);
+
+ DEBUG(printf("Private->Buffer = [%p]\n", (void *)virt_to_phys(ctrlr->buffer));)
+ DEBUG(printf("Admin Queue Attributes = [%X]\n", aqa);)
+ DEBUG(printf("Admin Submission Queue (sq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
+ DEBUG(printf("Admin Completion Queue (cq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
+ DEBUG(printf("I/O Submission Queue (sq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX]));)
+ DEBUG(printf("I/O Completion Queue (cq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX]));)
+
+ /* Write AQA */
+ writel(aqa, ctrlr->ctrlr_regs + NVME_AQA_OFFSET);
+ /* Write ASQ */
+ writell(asq, ctrlr->ctrlr_regs + NVME_ASQ_OFFSET);
+ /* Write ACQ */
+ writell(acq, ctrlr->ctrlr_regs + NVME_ACQ_OFFSET);
+
+ /* Enable controller */
+ status = nvme_enable_controller(ctrlr);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ /* Create IO queue pair */
+ status = nvme_create_cq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iocq_sz);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ status = nvme_create_sq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ /* Identify */
+ status = nvme_identify(ctrlr);
+ if (NVME_ERROR(status))
+ goto exit;
+
+ /* Identify Namespace and create drive nodes */
+ status = nvme_identify_namespaces(ctrlr);
+ if (NVME_ERROR(status))
+ goto exit;
+
+exit:
+ ctrlr->ctrlr.need_update = 0;
+
+ return NVME_ERROR(status);
+}
+
+static int nvme_shutdown(struct CleanupFunc *cleanup, CleanupType type)
+{
+ NvmeCtrlr *ctrlr = (NvmeCtrlr *)cleanup->data;
+ NvmeDrive *drive;
+ int status = NVME_SUCCESS;
+
+ printf("Shutting down NVMe controller.\n");
+
+ if (NULL == ctrlr)
+ return 1;
+
+ /* Only disable controller if initialized */
+ if (ctrlr->ctrlr.need_update != 1) {
+ status = nvme_disable_controller(ctrlr);
+ if (NVME_ERROR(status))
+ return 1;
+ }
+
+ list_for_each(drive, ctrlr->drives, list_node) {
+ free(drive);
+ }
+ free(ctrlr->controller_data);
+	for (int i = 0; i < NVME_CSQ_SIZE; i++)
+		free(ctrlr->prp_list[i]);
+ free(ctrlr->buffer);
+ free(ctrlr);
+ return 0;
+}
+
+/* Setup controller initialization/shutdown callbacks.
+ * Used in board.c to get handle to new ctrlr.
+ */
+NvmeCtrlr *new_nvme_ctrlr(pcidev_t dev)
+{
+ NvmeCtrlr *ctrlr = xzalloc(sizeof(*ctrlr));
+ static CleanupFunc cleanup = {
+ &nvme_shutdown,
+ CleanupOnHandoff | CleanupOnLegacy,
+ NULL
+ };
+
+ assert(cleanup.data == NULL);
+
+ printf("New NVMe Controller %p @ %02x:%02x:%02x\n",
+ ctrlr, PCI_BUS(dev),PCI_SLOT(dev),PCI_FUNC(dev));
+
+ ctrlr->ctrlr.ops.update = &nvme_ctrlr_init;
+ ctrlr->ctrlr.need_update = 1;
+ ctrlr->dev = dev;
+ cleanup.data = (void *)ctrlr;
+ list_insert_after(&cleanup.list_node, &cleanup_funcs);
+
+ return ctrlr;
+}
diff --git a/src/drivers/storage/nvme.h b/src/drivers/storage/nvme.h
new file mode 100644
index 0000000..dbf2580
--- /dev/null
+++ b/src/drivers/storage/nvme.h
@@ -0,0 +1,363 @@
+/*
+ * NVMe storage driver for depthcharge
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __DRIVERS_STORAGE_NVME_H__
+#define __DRIVERS_STORAGE_NVME_H__
+
+#include <pci.h>
+#include <stdint.h>
+#include <arch/barrier.h>
+#include "drivers/storage/blockdev.h"
+#include "base/list.h"
+
+//#define DEBUG_PRINTS
+#ifdef DEBUG_PRINTS
+#define DEBUG(x) x
+#else
+#define DEBUG(x)
+#endif
+
+/* BSD style bit manipulation */
+#define SET(t, f) ((t) |= (f))
+#define ISSET(t, f) ((t) & (f))
+#define CLR(t, f) ((t) &= ~(f))
+
+/* Architecture memory page size
+ * These should eventually reference the arch header definitions
+ */
+#define NVME_PAGE_SHIFT 12
+#define NVME_PAGE_SIZE (1UL << NVME_PAGE_SHIFT)
+
+/* Max 1 PRP list per transfer */
+#define MAX_PRP_LISTS 1
+/* 8 bytes per entry */
+#define PRP_ENTRY_SHIFT 3
+/* 1 page per list */
+#define PRP_LIST_SHIFT NVME_PAGE_SHIFT
+/* 1 page of memory addressed per entry*/
+#define PRP_ENTRY_XFER_SHIFT NVME_PAGE_SHIFT
+#define PRP_ENTRIES_PER_LIST (1UL << (PRP_LIST_SHIFT - PRP_ENTRY_SHIFT))
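+/* With 4KB pages: 1 list x 512 entries x 4KB per entry = 2MB max transfer */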
+#define NVME_MAX_XFER_BYTES ((MAX_PRP_LISTS * PRP_ENTRIES_PER_LIST) << PRP_ENTRY_XFER_SHIFT)
+
+/* Loop used to poll for command completions
+ * timeout in milliseconds
+ */
+#define WAIT_WHILE(expr, timeout) \
+ ({ \
+ typeof(timeout) __counter = timeout * 1000; \
+ typeof(expr) __expr_val; \
+ while ((__expr_val = (expr)) && __counter--) \
+ udelay(1); \
+ __expr_val; \
+ })
+
+/* Command timeout measured in milliseconds */
+#define NVME_GENERIC_TIMEOUT 5000
+
+#define writel_with_flush(a,b) do { writel(a, b); readl(b); } while (0)
+
+typedef int NVME_STATUS;
+#define NVME_SUCCESS 0
+#define NVME_UNSUPPORTED -1
+#define NVME_DEVICE_ERROR -2
+#define NVME_OUT_OF_RESOURCES -3
+#define NVME_TIMEOUT -4
+#define NVME_INVALID_PARAMETER -5
+
+#define NVME_ERROR(err) ((err) < 0?1:0)
+
+#define PCI_CLASS_MASS_STORAGE 0x01 /* mass storage class */
+#define PCI_CLASS_MASS_STORAGE_NVM 0x08 /* mass storage sub-class non-volatile memory. */
+#define PCI_IF_NVMHCI 0x02 /* mass storage programming interface NVMHCI. */
+
+/* Queue Definitions
+ * NOTE: The size of the IO queue is tuned for max_transfer_size as
+ * a performance optimization. A smaller size saves host memory at
+ * the cost of performance.
+ */
+#define NVME_ASQ_SIZE 2 /* Number of admin submission queue entries, only 2 */
+#define NVME_ACQ_SIZE 2 /* Number of admin completion queue entries, only 2 */
+
+#define NVME_CSQ_SIZE 15 /* Number of I/O submission queue entries per queue, min 2, max 64 */
+#define NVME_CCQ_SIZE 15 /* Number of I/O completion queue entries per queue, min 2, max 64 */
+
+#define NVME_NUM_QUEUES 2 /* Number of queues (Admin + IO) supported by the driver, only 2 supported */
+#define NVME_ADMIN_QUEUE_INDEX	0	/* Admin queue index must be 0 */
+#define NVME_IO_QUEUE_INDEX 1 /* IO queue */
+
+/*
+ * NVMe Controller Registers
+ */
+
+/* controller register offsets */
+#define NVME_CAP_OFFSET 0x0000 /* Controller Capabilities */
+#define NVME_VER_OFFSET 0x0008 /* Version */
+#define NVME_INTMS_OFFSET 0x000c /* Interrupt Mask Set */
+#define NVME_INTMC_OFFSET 0x0010 /* Interrupt Mask Clear */
+#define NVME_CC_OFFSET 0x0014 /* Controller Configuration */
+#define NVME_CSTS_OFFSET 0x001c /* Controller Status */
+#define NVME_AQA_OFFSET 0x0024 /* Admin Queue Attributes */
+#define NVME_ASQ_OFFSET 0x0028 /* Admin Submission Queue Base Address */
+#define NVME_ACQ_OFFSET 0x0030 /* Admin Completion Queue Base Address */
+#define NVME_SQ0_OFFSET 0x1000 /* Submission Queue 0 (admin) Tail Doorbell */
+#define NVME_CQ0_OFFSET 0x1004 /* Completion Queue 0 (admin) Head Doorbell */
+
+/* 3.1.1 Offset 00h: CAP - Controller Capabilities */
+typedef uint64_t NVME_CAP;
+#define NVME_CAP_TO(x) (500 * (((x) >> 24) & 0xff)) /* Timeout, ms (TO is in 500ms increments)*/
+#define NVME_CAP_DSTRD(x) (1 << (2 + (((x) >> 32) & 0xf))) /* Doorbell Stride, bytes */
+#define NVME_CAP_CSS(x) (((x) >> 37) & 0x7f) /* Command Set Supported */
+#define NVME_CAP_CSS_NVM (1)
+#define NVME_CAP_MPSMIN(x) (12 + (((x) >> 48) & 0xf)) /* Memory Page Size Minimum */
+#define NVME_CAP_MQES(x) (((x) & 0xffff) + 1) /* Max Queue Entries Supported per queue */
+
+/* 3.1.5 Offset 14h: CC - Controller Configuration */
+typedef uint32_t NVME_CC;
+#define NVME_CC_EN (1 << 0)
+#define NVME_CC_IOCQES(x) (((x) & 0xf) << 20)
+#define NVME_CC_IOSQES(x) (((x) & 0xf) << 16)
+
+/* 3.1.6 Offset 1Ch: CSTS - Controller Status */
+typedef uint32_t NVME_CSTS;
+#define NVME_CSTS_RDY (1 << 0)
+
+/* 3.1.8 Offset 24h: AQA - Admin Queue Attributes */
+typedef uint32_t NVME_AQA;
+#define NVME_AQA_ASQS(x) ((x) - 1)
+#define NVME_AQA_ACQS(x) (((x) - 1) << 16)
+
+/* 3.1.9 Offset 28h: ASQ - Admin Submission Queue Base Address */
+typedef uint64_t NVME_ASQ;
+
+/* 3.1.10 Offset 30h: ACQ - Admin Completion Queue Base Address */
+typedef uint64_t NVME_ACQ;
+
+/* 3.1.11 Offset (1000h + ((2y) * (DSTRD bytes)))
+ * SQyTDBL - Submission Queue y Tail Doorbell
+ */
+typedef uint32_t NVME_SQTDBL;
+
+/* 3.1.12 Offset (1000h + ((2y + 1) * (DSTRD bytes)))
+ * CQyHDBL - Completion Queue y Head Doorbell
+ */
+typedef uint32_t NVME_CQHDBL;
+
+/* These register offsets are defined as 0x1000 + (N * (DSTRD bytes))
+ * Get the doorbell stride bit shift value from the controller capabilities.
+ */
+#define NVME_SQTDBL_OFFSET(QID, DSTRD) (0x1000 + ((2 * (QID)) * (DSTRD))) /* Submission Queue y (NVM) Tail Doorbell */
+#define NVME_CQHDBL_OFFSET(QID, DSTRD) (0x1000 + (((2 * (QID)) + 1) * (DSTRD))) /* Completion Queue y (NVM) Head Doorbell */
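+/* Example: with DSTRD = 4 bytes, queue 1 uses SQ1TDBL at 0x1008 and CQ1HDBL at 0x100c */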
+
+/*
+ * NVMe Command Set Types
+ */
+
+/* NVMe Admin Cmd Opcodes */
+#define NVME_ADMIN_CRIOSQ_OPC 1
+#define NVME_ADMIN_CRIOSQ_QID(x) (x)
+#define NVME_ADMIN_CRIOSQ_QSIZE(x) (((x)-1) << 16)
+#define NVME_ADMIN_CRIOSQ_CQID(x) ((x) << 16)
+
+#define NVME_ADMIN_CRIOCQ_OPC 5
+#define NVME_ADMIN_CRIOCQ_QID(x) (x)
+#define NVME_ADMIN_CRIOCQ_QSIZE(x) (((x)-1) << 16)
+
+#define NVME_ADMIN_IDENTIFY_OPC 6
+
+#define NVME_IO_FLUSH_OPC 0
+#define NVME_IO_WRITE_OPC 1
+#define NVME_IO_READ_OPC 2
+
+/* Submission Queue */
+typedef struct {
+ uint8_t opc; /* Opcode */
+ uint8_t flags; /* FUSE and PSDT, only 0 setting supported */
+ uint16_t cid; /* Command Identifier */
+ uint32_t nsid; /* Namespace Identifier */
+ uint64_t rsvd1;
+ uint64_t mptr; /* Metadata Pointer */
+ uint64_t prp[2]; /* PRP entries only, SGL not supported */
+ uint32_t cdw10;
+ uint32_t cdw11;
+ uint32_t cdw12;
+ uint32_t cdw13;
+ uint32_t cdw14;
+ uint32_t cdw15;
+} NVME_SQ;
+
+/* Completion Queue */
+typedef struct {
+ uint32_t cdw0;
+ uint32_t rsvd1;
+ uint16_t sqhd; /* Submission Queue Head Pointer */
+ uint16_t sqid; /* Submission Queue Identifier */
+ uint16_t cid; /* Command Identifier */
+ uint16_t flags;
+#define NVME_CQ_FLAGS_PHASE 0x1
+#define NVME_CQ_FLAGS_SC(x) (((x) & 0x1FE) >> 1)
+#define NVME_CQ_FLAGS_SCT(x) (((x) & 0xE00) >> 9)
+} NVME_CQ;
+
+typedef struct {
+ uint32_t power_flags; /* MP, MPS and NOPS */
+ uint32_t enlat; /* Entry Latency */
+ uint32_t exlat; /* Exit Latency */
+ uint32_t latency_flags;
+ uint8_t rsvd7[16]; /* Reserved as of Nvm Express 1.1 Spec */
+} NVME_PSDESCRIPTOR;
+
+/* Identify Controller Data */
+typedef struct {
+ /* Controller Capabilities and Features 0-255 */
+ uint16_t vid; /* PCI Vendor ID */
+ uint16_t ssvid; /* PCI sub-system vendor ID */
+	uint8_t sn[20];		/* Product serial number */
+
+	uint8_t mn[40];		/* Product model number */
+ uint8_t fr[8]; /* Firmware Revision */
+ uint8_t rab; /* Recommended Arbitration Burst */
+	uint8_t ieee_oiu[3];	/* IEEE Organizationally Unique Identifier */
+ uint8_t cmic; /* Multi-interface Capabilities */
+ uint8_t mdts; /* Maximum Data Transfer Size */
+ uint8_t cntlid[2]; /* Controller ID */
+ uint8_t rsvd1[176]; /* Reserved as of Nvm Express 1.1 Spec */
+ //
+ // Admin Command Set Attributes
+ //
+ uint16_t oacs; /* Optional Admin Command Support */
+ uint8_t acl; /* Abort Command Limit */
+ uint8_t aerl; /* Async Event Request Limit */
+ uint8_t frmw; /* Firmware updates */
+ uint8_t lpa; /* Log Page Attributes */
+ uint8_t elpe; /* Error Log Page Entries */
+ uint8_t npss; /* Number of Power States Support */
+ uint8_t avscc; /* Admin Vendor Specific Command Configuration */
+ uint8_t apsta; /* Autonomous Power State Transition Attributes */
+ uint8_t rsvd2[246]; /* Reserved as of Nvm Express 1.1 Spec */
+ //
+ // NVM Command Set Attributes
+ //
+ uint8_t sqes; /* Submission Queue Entry Size */
+ uint8_t cqes; /* Completion Queue Entry Size */
+ uint16_t rsvd3; /* Reserved as of Nvm Express 1.1 Spec */
+ uint32_t nn; /* Number of Namespaces */
+ uint16_t oncs; /* Optional NVM Command Support */
+ uint16_t fuses; /* Fused Operation Support */
+ uint8_t fna; /* Format NVM Attributes */
+ uint8_t vwc; /* Volatile Write Cache */
+ uint16_t awun; /* Atomic Write Unit Normal */
+ uint16_t awupf; /* Atomic Write Unit Power Fail */
+ uint8_t nvscc; /* NVM Vendor Specific Command Configuration */
+ uint8_t rsvd4; /* Reserved as of Nvm Express 1.1 Spec */
+ uint16_t acwu; /* Atomic Compare & Write Unit */
+ uint16_t rsvd5; /* Reserved as of Nvm Express 1.1 Spec */
+ uint32_t sgls; /* SGL Support */
+ uint8_t rsvd6[164]; /* Reserved as of Nvm Express 1.1 Spec */
+ //
+ // I/O Command set Attributes
+ //
+ uint8_t rsvd7[1344]; /* Reserved as of Nvm Express 1.1 Spec */
+ //
+ // Power State Descriptors
+ //
+ NVME_PSDESCRIPTOR ps_descriptor[32];
+
+ uint8_t vendor_data[1024]; /* Vendor specific data */
+} NVME_ADMIN_CONTROLLER_DATA;
+
+typedef struct {
+ uint16_t ms; /* Metadata Size */
+ uint8_t lbads; /* LBA Data Size */
+ uint8_t rp; /* Relative Performance */
+} NVME_LBAFORMAT;
+
+/* Identify Namespace Data */
+typedef struct {
+	uint64_t nsze;		/* Namespace Size (total blocks in formatted namespace) */
+ uint64_t ncap; /* Namespace Capacity (max number of logical blocks) */
+ uint64_t nuse; /* Namespace Utilization */
+ uint8_t nsfeat; /* Namespace Features */
+ uint8_t nlbaf; /* Number of LBA Formats */
+ uint8_t flbas; /* Formatted LBA size */
+ uint8_t mc; /* Metadata Capabilities */
+ uint8_t dpc; /* End-to-end Data Protection capabilities */
+ uint8_t dps; /* End-to-end Data Protection Type Settings */
+ uint8_t nmic; /* Namespace Multi-path I/O + NS Sharing Caps */
+ uint8_t rescap; /* Reservation Capabilities */
+ uint8_t rsvd1[88]; /* Reserved as of Nvm Express 1.1 Spec */
+ uint64_t eui64; /* IEEE Extended Unique Identifier */
+
+ NVME_LBAFORMAT lba_format[16];
+
+ uint8_t rsvd2[192]; /* Reserved as of Nvm Express 1.1 Spec */
+ uint8_t vendor_data[3712]; /* Vendor specific data */
+} NVME_ADMIN_NAMESPACE_DATA;
+
+typedef struct PrpList {
+ uint64_t prp_entry[PRP_ENTRIES_PER_LIST];
+} PrpList;
+
+/*
+ * Driver Types
+ */
+typedef struct NvmeCtrlr {
+ BlockDevCtrlr ctrlr;
+ ListNode drives;
+
+ pcidev_t dev;
+ uint32_t ctrlr_regs;
+
+ /* local copy of controller CAP register */
+ NVME_CAP cap;
+
+ /* virtual address of identify controller data */
+ NVME_ADMIN_CONTROLLER_DATA *controller_data;
+
+ /* virtual address of pre-allocated PRP Lists */
+ PrpList *prp_list[NVME_CSQ_SIZE];
+
+ /* virtual address of raw buffer, split into queues below */
+ uint8_t *buffer;
+ /* virtual addresses of queue buffers */
+ NVME_SQ *sq_buffer[NVME_NUM_QUEUES];
+ NVME_CQ *cq_buffer[NVME_NUM_QUEUES];
+
+ NVME_SQTDBL sq_t_dbl[NVME_NUM_QUEUES];
+ NVME_CQHDBL cq_h_dbl[NVME_NUM_QUEUES];
+
+ /* current phase of each queue */
+ uint8_t pt[NVME_NUM_QUEUES];
+ /* sq head index as of most recent completion */
+ uint16_t sqhd[NVME_NUM_QUEUES];
+ /* current command id for each queue */
+ uint16_t cid[NVME_NUM_QUEUES];
+
+ /* Actual IO SQ size accounting for MQES */
+ uint16_t iosq_sz;
+ /* Actual IO CQ size accounting for MQES*/
+ uint16_t iocq_sz;
+} NvmeCtrlr;
+
+typedef struct NvmeDrive {
+ BlockDev dev;
+
+ NvmeCtrlr *ctrlr;
+ uint32_t namespace_id;
+
+ ListNode list_node;
+} NvmeDrive;
+
+NvmeCtrlr *new_nvme_ctrlr(pcidev_t dev);
+
+#endif /* __DRIVERS_STORAGE_NVME_H__ */