blob: 41bb86212a68e866bcd900ee0c99eb31a3d98841 [file] [log] [blame]
/*
* NVMe storage driver for depthcharge
* Copyright (c) 2015, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
/* Documentation:
* This driver implements a minimal subset of the NVMe 1.0e specification.
* (nvmexpress.org) It is designed to balance simplicity and performance.
* Therefore it operates by polling the NVMe Completion Queue (CQ) for phase
* changes rather than utilizing interrupts. The initialization functions are
* processed one at a time, therefore the Admin Queue pair only supports depth
* 2.
* This driver is limited to a single IO queue pair (in addition to the
* mandatory Admin queue pair). The IO queue depth is configurable, but has
* shallow defaults to minimize host memory consumption. This driver only
* supports a maximum of one PRP List, limiting the maximum transfer size to
* 2MB (assuming 4KB memory pages).
*
* Operation:
* At initialization this driver allocates a pool of host memory and overlays
* the queue pair structures. It also statically allocates a block of memory
* for a PRP List, avoiding the need to allocate/free memory at IO time.
* Each identified NVMe namespace has a corresponding depthcharge BlockDev
* structure, effectively creating a new "drive" visible to higher levels.
*
* The depthcharge read/write callbacks split host requests into chunks
* satisfying the NVMe device's maximum transfer size limitations. Then they
* call the corresponding _internal_ functions to facilitate formatting of the
* NVMe structures in host memory. After all of the commands have been created
* in host memory the Submission Queue tail pointer is updated allowing the
* drive to process the newly submitted commands. Queuing commands allows the
* drive to internally optimize accesses, increasing performance. Finally, the
* Completion Queue phase bit is polled until it inverts, indicating that the
* command has completed. If the SQ is full, outstanding commands will be
* completed before the _internal_ function proceeds. This situation reduces
* effective performance and should be avoided by increasing SQ/CQ depth.
*/
#include <assert.h>
#include <endian.h>
#include <libpayload.h>
#include <stdint.h>
#include <stdio.h>
#include "base/cleanup_funcs.h"
#include "drivers/storage/blockdev.h"
#include "drivers/storage/nvme.h"
/* Read a 64-bit register as two 32-bit MMIO accesses.
 *
 * libpayload only provides 32-bit accessors; assemble the value from the
 * low and high dwords. readl() already returns a host-endian value, so
 * combine with shifts instead of type-punning a uint32_t pointer into a
 * uint64_t (the previous code violated strict aliasing and was only
 * correct on little-endian targets anyway).
 *
 * _a: bus address of the 64-bit register
 * Returns the register value.
 */
static uint64_t readll(uintptr_t _a)
{
	uint64_t lo = readl(_a);
	uint64_t hi = readl(_a + sizeof(uint32_t));

	return (hi << 32) | lo;
}
/* Write a 64-bit register as two 32-bit MMIO accesses, low dword first.
 *
 * writel() handles any cpu-to-LE conversion itself, so split the value
 * with shifts rather than punning through a uint32_t pointer (the
 * previous code violated strict aliasing). The `volatile const`
 * qualifiers on the by-value address parameter were meaningless and
 * have been dropped; callers are unaffected.
 *
 * _v: value to write
 * _a: bus address of the 64-bit register
 */
static void writell(uint64_t _v, uintptr_t _a)
{
	writel((uint32_t)_v, _a);
	writel((uint32_t)(_v >> 32), _a + sizeof(uint32_t));
}
/* Debug-only helper: pretty-print one Completion Queue entry.
 * Compiled in only when the DEBUG() macro expands its argument. */
DEBUG(
static void nvme_dump_status(NVME_CQ volatile *cq) {
printf("Dump NVMe Completion Entry Status from [%p]:\n", cq);
printf(" SQ ID : [0x%x], Phase Tag : [%d], Cmd ID : [0x%x] Flags : [0x%x]\n",
cq->sqid, cq->flags & NVME_CQ_FLAGS_PHASE, cq->cid, cq->flags);
/* Status Code Type 0 == generic command status */
if (NVME_CQ_FLAGS_SCT(cq->flags) == 0) {
/* Status Code 0 == successful completion */
if (NVME_CQ_FLAGS_SC(cq->flags) == 0)
printf(" NVMe Cmd Execution Result - Successful\n");
else
printf(" NVMe Cmd Execution Result - error sc=%u\n",NVME_CQ_FLAGS_SC(cq->flags));
} else
printf(" NVMe Cmd Execution Result - error sct=%u\n",NVME_CQ_FLAGS_SCT(cq->flags));
}
) //DEBUG
/* Disable (reset) the controller: clear CC.EN, then wait for CSTS.RDY
 * to drop.
 *
 * ctrlr: NVMe controller handle
 * Returns NVME_SUCCESS, or NVME_TIMEOUT if RDY never cleared.
 */
static NVME_STATUS nvme_disable_controller(NvmeCtrlr *ctrlr) {
	NVME_CC config;
	uint8_t to_ms;

	/* Read-modify-write the controller configuration with EN cleared */
	config = readl(ctrlr->ctrlr_regs + NVME_CC_OFFSET);
	CLR(config, NVME_CC_EN);
	writel_with_flush(config, ctrlr->ctrlr_regs + NVME_CC_OFFSET);

	/* CAP.TO bounds the RDY transition; never wait less than 1 tick */
	to_ms = NVME_CAP_TO(ctrlr->cap) ? NVME_CAP_TO(ctrlr->cap) : 1;

	/* Poll until CSTS.RDY reads back 0, or give up */
	if (WAIT_WHILE(
		((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 1),
		to_ms))
		return NVME_TIMEOUT;

	return NVME_SUCCESS;
}
/* Enable the controller: set CC.EN with the spec-recommended queue
 * entry sizes, then wait for CSTS.RDY to assert.
 *
 * ctrlr: NVMe controller handle
 * Returns NVME_SUCCESS, or NVME_TIMEOUT if RDY never set.
 */
static NVME_STATUS nvme_enable_controller(NvmeCtrlr *ctrlr) {
	NVME_CC config = 0;
	uint8_t to_ms;

	SET(config, NVME_CC_EN);
	/* Spec. recommended values: 2^6 = 64B SQ entries, 2^4 = 16B CQ entries */
	config |= NVME_CC_IOSQES(6);
	config |= NVME_CC_IOCQES(4);
	writel_with_flush(config, ctrlr->ctrlr_regs + NVME_CC_OFFSET);

	/* CAP.TO bounds the RDY transition; never wait less than 1 tick */
	to_ms = NVME_CAP_TO(ctrlr->cap) ? NVME_CAP_TO(ctrlr->cap) : 1;

	/* Poll until CSTS.RDY reads back 1, or give up */
	if (WAIT_WHILE(
		((readl(ctrlr->ctrlr_regs + NVME_CSTS_OFFSET) & NVME_CSTS_RDY) == 0),
		to_ms))
		return NVME_TIMEOUT;

	return NVME_SUCCESS;
}
/* Submit and complete 1 command by polling CQ for phase change.
 * Rings the SQ doorbell, busy-waits for the next completion entry's
 * phase bit to flip, then rings the CQ doorbell. Used for admin
 * commands, which this driver issues one at a time.
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * sqsize: Number of commands (size) of the submission queue
 * cqsize: Number of commands (size) of the completion queue
 * timeout_ms: How long in milliseconds to wait for command completion
 * Returns NVME_SUCCESS, NVME_INVALID_PARAMETER, or NVME_TIMEOUT.
 */
static NVME_STATUS nvme_submit_cmd_polled(NvmeCtrlr *ctrlr,
uint16_t qid,
uint32_t sqsize,
uint32_t cqsize,
uint32_t timeout_ms) {
NVME_CQ *cq;
if (NULL == ctrlr)
return NVME_INVALID_PARAMETER;
if (qid > (NVME_NUM_QUEUES - 1))
return NVME_INVALID_PARAMETER;
/* Guarantee a non-zero wait */
if (timeout_ms == 0)
timeout_ms = 1;
/* Completion entry the just-queued command will land in */
cq = ctrlr->cq_buffer[qid] + ctrlr->cq_h_dbl[qid];
/* Update SQ tail index in host memory, wrapping at sqsize */
if (++(ctrlr->sq_t_dbl[qid]) > (sqsize-1))
ctrlr->sq_t_dbl[qid] = 0;
/* Ring the submission queue doorbell */
writel_with_flush(ctrlr->sq_t_dbl[qid],
ctrlr->ctrlr_regs +
NVME_SQTDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
/* Wait for phase to change (or timeout); the controller inverts the
 * phase bit each time it posts a completion into this slot */
if (WAIT_WHILE(
((readw(&(cq->flags)) & NVME_CQ_FLAGS_PHASE) == ctrlr->pt[qid]),
timeout_ms)) {
printf("nvme_submit_cmd_polled: ERROR - timeout\n");
return NVME_TIMEOUT;
}
/* Dump completion entry status for debugging. */
DEBUG(nvme_dump_status(cq);)
/* Advance CQ head; on wrap-around the expected phase inverts */
if (++(ctrlr->cq_h_dbl[qid]) > (cqsize-1)) {
ctrlr->cq_h_dbl[qid] = 0;
ctrlr->pt[qid] ^= 1;
}
/* Cache the SQ head the controller reported in this completion */
ctrlr->sqhd[qid] = cq->sqhd;
/* Ring the CQ doorbell, releasing the consumed entry to the HW */
writel_with_flush(ctrlr->cq_h_dbl[qid],
ctrlr->ctrlr_regs +
NVME_CQHDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
return NVME_SUCCESS;
}
/* Queue a command in the host-side SQ without ringing the doorbell.
 * The caller batches commands and later publishes the tail via
 * nvme_ring_sq_doorbell().
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * sqsize: Size of the submission queue
 * Returns NVME_SUCCESS or NVME_INVALID_PARAMETER.
 */
static NVME_STATUS nvme_submit_cmd(NvmeCtrlr *ctrlr, uint16_t qid, uint32_t sqsize) {
	if (ctrlr == NULL)
		return NVME_INVALID_PARAMETER;
	if (qid >= NVME_NUM_QUEUES)
		return NVME_INVALID_PARAMETER;

	/* Advance the shadow SQ tail, wrapping at the queue size */
	ctrlr->sq_t_dbl[qid] = (ctrlr->sq_t_dbl[qid] + 1) % sqsize;
	return NVME_SUCCESS;
}
/* Publish the host-side SQ tail to the hardware doorbell, submitting
 * every queued-but-unsubmitted command at once.
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * Returns NVME_SUCCESS or NVME_INVALID_PARAMETER.
 */
static NVME_STATUS nvme_ring_sq_doorbell(NvmeCtrlr *ctrlr, uint16_t qid) {
	uintptr_t doorbell;

	if (ctrlr == NULL)
		return NVME_INVALID_PARAMETER;
	if (qid >= NVME_NUM_QUEUES)
		return NVME_INVALID_PARAMETER;

	doorbell = ctrlr->ctrlr_regs +
		   NVME_SQTDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap));
	writel_with_flush(ctrlr->sq_t_dbl[qid], doorbell);
	return NVME_SUCCESS;
}
/* Poll for completion of all outstanding commands on a queue pair.
 * Walks the CQ, waiting for each entry's phase bit to flip, then rings
 * the CQ doorbell once for the whole batch.
 *
 * ctrlr: NVMe controller handle
 * qid: Queue Identifier for the SQ/CQ containing the new command
 * cqsize: Size of the completion queue
 * timeout_ms: How long in milliseconds to wait for each completion
 * Returns NVME_SUCCESS, NVME_INVALID_PARAMETER, or NVME_TIMEOUT.
 */
static NVME_STATUS nvme_complete_cmds_polled(NvmeCtrlr *ctrlr,
uint16_t qid,
uint32_t cqsize,
uint32_t timeout_ms) {
NVME_CQ *cq;
uint32_t ncmds;
if (NULL == ctrlr)
return NVME_INVALID_PARAMETER;
if (qid > (NVME_NUM_QUEUES - 1))
return NVME_INVALID_PARAMETER;
/* Guarantee a non-zero wait */
if (timeout_ms == 0)
timeout_ms = 1;
/* Outstanding count = distance from CQ head to SQ tail, modulo the
 * queue size.
 * NOTE(review): when cq_h_dbl == sq_t_dbl the else-branch yields
 * ncmds == cqsize, not 0. Callers appear to invoke this only with
 * commands in flight; confirm before calling on an empty queue. */
if (ctrlr->cq_h_dbl[qid] < ctrlr->sq_t_dbl[qid])
ncmds = ctrlr->sq_t_dbl[qid] - ctrlr->cq_h_dbl[qid];
else
ncmds = (cqsize - ctrlr->cq_h_dbl[qid]) + ctrlr->sq_t_dbl[qid];
DEBUG(printf("nvme_complete_cmds_polled: completing %u commands\n",(unsigned)ncmds);)
while (ncmds--) {
cq = ctrlr->cq_buffer[qid] + ctrlr->cq_h_dbl[qid];
/* Wait for phase to change (or timeout) */
if (WAIT_WHILE(
((readw(&(cq->flags)) & NVME_CQ_FLAGS_PHASE) == ctrlr->pt[qid]),
timeout_ms)) {
printf("nvme_complete_cmds_polled: ERROR - timeout\n");
return NVME_TIMEOUT;
}
/* Dump completion entry status for debugging. */
DEBUG(nvme_dump_status(cq);)
/* Advance CQ head; on wrap invert the expected phase and restart the
 * command-id sequence (cid also indexes the per-command PRP lists) */
if (++(ctrlr->cq_h_dbl[qid]) > (cqsize-1)) {
ctrlr->cq_h_dbl[qid] = 0;
ctrlr->pt[qid] ^= 1;
ctrlr->cid[qid] = 0;
}
/* Cache the SQ head reported by the controller */
ctrlr->sqhd[qid] = cq->sqhd;
}
/* Ring the completion queue doorbell register once for the batch */
writel_with_flush(ctrlr->cq_h_dbl[qid], ctrlr->ctrlr_regs + NVME_CQHDBL_OFFSET(qid, NVME_CAP_DSTRD(ctrlr->cap)));
return NVME_SUCCESS;
}
/* Issue the admin Create I/O Completion Queue command for queue 'qid'.
 * The CQ memory in ctrlr->cq_buffer[qid] must already be allocated and
 * physically contiguous.
 *
 * ctrlr: NVMe controller handle
 * qid: id of the queue to create
 * qsize: number of CQ entries
 * Returns the admin command's completion status.
 */
static NVME_STATUS nvme_create_cq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
	NVME_SQ *cmd;

	/* Build the command in the next free admin SQ slot */
	cmd = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] +
	      ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
	memset(cmd, 0, sizeof(*cmd));
	cmd->opc = NVME_ADMIN_CRIOCQ_OPC;
	cmd->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
	/* Only physically contiguous addresses are supported */
	cmd->prp[0] = (uintptr_t)virt_to_phys(ctrlr->cq_buffer[qid]);
	/* CDW11 bit 0: physically contiguous (PC) */
	cmd->cdw11 = 1;
	cmd->cdw10 = NVME_ADMIN_CRIOCQ_QID(qid) | NVME_ADMIN_CRIOCQ_QSIZE(qsize);

	return nvme_submit_cmd_polled(ctrlr, NVME_ADMIN_QUEUE_INDEX,
				      NVME_ASQ_SIZE, NVME_ACQ_SIZE,
				      NVME_GENERIC_TIMEOUT);
}
/* Issue the admin Create I/O Submission Queue command for queue 'qid'.
 * NOTE: Assumes that completion queue ID == submission queue ID.
 * The SQ memory in ctrlr->sq_buffer[qid] must already be allocated and
 * physically contiguous.
 *
 * ctrlr: NVMe controller handle
 * qid: id of the queue to create (and of its paired CQ)
 * qsize: number of SQ entries
 * Returns the admin command's completion status.
 */
static NVME_STATUS nvme_create_sq(NvmeCtrlr *ctrlr, uint16_t qid, uint16_t qsize) {
	NVME_SQ *cmd;

	/* Build the command in the next free admin SQ slot */
	cmd = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] +
	      ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
	memset(cmd, 0, sizeof(*cmd));
	cmd->opc = NVME_ADMIN_CRIOSQ_OPC;
	cmd->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
	/* Only physically contiguous addresses are supported */
	cmd->prp[0] = (uintptr_t)virt_to_phys(ctrlr->sq_buffer[qid]);
	/* CDW11: PC bit plus the paired CQ id (CQID == SQID by design) */
	cmd->cdw11 = 1 | NVME_ADMIN_CRIOSQ_CQID(qid);
	cmd->cdw10 = NVME_ADMIN_CRIOSQ_QID(qid) | NVME_ADMIN_CRIOSQ_QSIZE(qsize);

	return nvme_submit_cmd_polled(ctrlr, NVME_ADMIN_QUEUE_INDEX,
				      NVME_ASQ_SIZE, NVME_ACQ_SIZE,
				      NVME_GENERIC_TIMEOUT);
}
/* Generate PRPs for a single virtual memory buffer.
 * PRP0 always points at the (possibly unaligned) buffer start; PRP1 is
 * either the second page (for transfers spanning at most 2 pages) or
 * the physical address of a single PRP List page that covers the rest.
 *
 * prp_list: pre-allocated prp list buffer (one page)
 * prp: pointer to SQ PRP array (2 entries)
 * buffer: host buffer for request
 * size: number of bytes in request
 * Returns NVME_SUCCESS, or NVME_INVALID_PARAMETER if the transfer needs
 * more than one PRP List.
 */
static NVME_STATUS nvme_fill_prp(PrpList *prp_list, uint64_t *prp, void *buffer, uint64_t size)
{
/* Byte offset of the buffer within its first page */
uint64_t offset = (uintptr_t)buffer & (NVME_PAGE_SIZE - 1);
uint64_t xfer_pages;
uintptr_t buffer_phys = virt_to_phys(buffer);
/* PRP0 is always the (potentially unaligned) start of the buffer */
prp[0] = buffer_phys;
/* Increment buffer to the next aligned page: if already aligned just
 * step one page, otherwise round up to the next page boundary */
if (ALIGN(buffer_phys,NVME_PAGE_SIZE) == buffer_phys)
buffer_phys += NVME_PAGE_SIZE;
else
buffer_phys = ALIGN_UP(buffer_phys,NVME_PAGE_SIZE);
/* Case 1: all data will fit in 2 PRP entries (accounting for buffer offset) */
if ((size + offset) <= (2 * NVME_PAGE_SIZE)) {
prp[1] = buffer_phys;
return NVME_SUCCESS;
}
/* Case 2: Need to build up to one PRP List.
 * Number of pages the transfer touches, as a page-multiple of
 * (size + offset) */
xfer_pages = (ALIGN((size + offset), NVME_PAGE_SIZE) >> NVME_PAGE_SHIFT);
/* Don't count first prp entry as it is the beginning of buffer */
xfer_pages--;
/* Make sure this transfer fits into one PRP list */
if (xfer_pages > (NVME_MAX_XFER_BYTES/NVME_PAGE_SIZE))
return NVME_INVALID_PARAMETER;
/* Fill the PRP List: one entry per remaining page, physically
 * consecutive since the buffer is one contiguous allocation */
prp[1] = (uintptr_t)virt_to_phys(prp_list);
for (uint32_t entry_index = 0; entry_index < xfer_pages; entry_index++) {
prp_list->prp_entry[entry_index] = buffer_phys;
buffer_phys += NVME_PAGE_SIZE;
}
return NVME_SUCCESS;
}
/* Sets up read operation for up to max_transfer blocks.
 * Builds one NVMe Read command in the IO SQ but does NOT ring the
 * doorbell; the caller batches commands and submits them via
 * nvme_ring_sq_doorbell(). If the SQ is full, in-flight commands are
 * submitted and completed first to make room.
 *
 * drive: namespace to read from
 * buffer: destination buffer
 * start: first LBA
 * count: number of blocks (0 is rejected)
 * Returns NVME_SUCCESS or an NVME_* error code.
 */
static NVME_STATUS nvme_internal_read(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
{
NvmeCtrlr *ctrlr = drive->ctrlr;
NVME_SQ *sq;
int status = NVME_SUCCESS;
if (count == 0)
return NVME_INVALID_PARAMETER;
/* If queue is full, need to complete inflight commands before submitting more */
if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
DEBUG(printf("nvme_internal_read: Too many outstanding commands. Completing in-flights\n");)
/* Submit commands to controller */
nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
/* Complete submitted command(s) */
status = nvme_complete_cmds_polled(ctrlr,
NVME_IO_QUEUE_INDEX,
NVME_CCQ_SIZE,
NVME_GENERIC_TIMEOUT);
if (NVME_ERROR(status)) {
printf("nvme_internal_read: error %d completing outstanding commands\n",status);
return status;
}
}
/* Build the Read command in the next free IO SQ slot */
sq = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];
memset(sq, 0, sizeof(NVME_SQ));
sq->opc = NVME_IO_READ_OPC;
/* cid also selects which pre-allocated PRP list this command uses */
sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
sq->nsid = drive->namespace_id;
status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
if (NVME_ERROR(status)) {
printf("nvme_internal_read: error %d generating PRP(s)\n",status);
return status;
}
/* CDW10/11 = starting LBA, low/high 32 bits */
sq->cdw10 = start;
sq->cdw11 = (start >> 32);
/* CDW12 bits 15:0 = number of logical blocks, 0-based */
sq->cdw12 = (count - 1) & 0xFFFF;
/* Queue in host memory only; doorbell is rung by the caller */
status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
return status;
}
/* Read operation entrypoint.
 * Cuts the request into chunks no larger than the device/driver maximum
 * transfer size, queues each chunk, then rings the SQ doorbell once and
 * polls for completion of the whole batch.
 *
 * me: BlockDevOps embedded in an NvmeDrive
 * start: first LBA
 * count: number of blocks to read
 * buffer: destination for the data
 * Returns the number of blocks read, or -1 on error.
 */
static lba_t nvme_read(BlockDevOps *me, lba_t start, lba_t count, void *buffer)
{
	NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
	NvmeCtrlr *ctrlr = drive->ctrlr;
	uint64_t max_transfer_blocks = 0;
	uint32_t block_size = drive->dev.block_size;
	lba_t orig_count = count;
	int status = NVME_SUCCESS;

	DEBUG(printf("nvme_read: Reading from namespace %d\n",drive->namespace_id);)
	/* MDTS is a power-of-2 multiple of the minimum page size; 0 means
	 * "no limit". Shift in 64 bits to avoid int overflow UB. */
	if (ctrlr->controller_data->mdts != 0)
		max_transfer_blocks = (((uint64_t)1 << (ctrlr->controller_data->mdts)) * ((uint64_t)1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
	/* Artificially limit max_transfer_blocks to 1 PRP List.
	 * BUGFIX: the limit in blocks is NVME_MAX_XFER_BYTES / block_size;
	 * the previous code multiplied by block_size here, which made the
	 * clamp ineffective for controllers reporting a large MDTS. */
	if ((max_transfer_blocks == 0) ||
	    (max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
		max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;

	/* Queue one command per chunk; stop early on error */
	while (count > 0) {
		if (count > max_transfer_blocks) {
			DEBUG(printf("nvme_read: partial read of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
			status = nvme_internal_read(drive, buffer, start, max_transfer_blocks);
			count -= max_transfer_blocks;
			buffer += max_transfer_blocks*block_size;
			start += max_transfer_blocks;
		} else {
			DEBUG(printf("nvme_read: final read of %llu blocks\n",(unsigned long long)count);)
			status = nvme_internal_read(drive, buffer, start, count);
			count = 0;
		}
		if (NVME_ERROR(status))
			break;
	}
	if (NVME_ERROR(status)) {
		printf("nvme_read: error %d\n",status);
		return -1;
	}
	/* Submit the whole batch to the controller */
	nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
	/* Complete submitted command(s) */
	nvme_complete_cmds_polled(ctrlr,
				  NVME_IO_QUEUE_INDEX,
				  NVME_CCQ_SIZE,
				  NVME_GENERIC_TIMEOUT);
	DEBUG(printf("nvme_read: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)
	return orig_count - count;
}
/* Sets up write operation for up to max_transfer blocks.
 * Builds one NVMe Write command in the IO SQ but does NOT ring the
 * doorbell; the caller batches commands and submits them via
 * nvme_ring_sq_doorbell(). If the SQ is full, in-flight commands are
 * submitted and completed first to make room.
 *
 * drive: namespace to write to
 * buffer: source buffer
 * start: first LBA
 * count: number of blocks (0 is rejected)
 * Returns NVME_SUCCESS or an NVME_* error code.
 */
static NVME_STATUS nvme_internal_write(NvmeDrive *drive, void *buffer, lba_t start, lba_t count)
{
	NvmeCtrlr *ctrlr = drive->ctrlr;
	NVME_SQ *sq;
	int status = NVME_SUCCESS;

	if (count == 0)
		return NVME_INVALID_PARAMETER;
	/* If queue is full, need to complete inflight commands before submitting more */
	if ((ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX] + 1) % ctrlr->iosq_sz == ctrlr->sqhd[NVME_IO_QUEUE_INDEX]) {
		DEBUG(printf("nvme_internal_write: Too many outstanding commands. Completing in-flights\n");)
		/* Submit commands to controller */
		nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
		/* Complete submitted command(s) */
		status = nvme_complete_cmds_polled(ctrlr,
						   NVME_IO_QUEUE_INDEX,
						   NVME_CCQ_SIZE,
						   NVME_GENERIC_TIMEOUT);
		if (NVME_ERROR(status)) {
			/* BUGFIX: message previously said "nvme_internal_read" */
			printf("nvme_internal_write: error %d completing outstanding commands\n",status);
			return status;
		}
	}
	/* Build the Write command in the next free IO SQ slot */
	sq = ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_IO_QUEUE_INDEX];
	memset(sq, 0, sizeof(NVME_SQ));
	sq->opc = NVME_IO_WRITE_OPC;
	/* cid also selects which pre-allocated PRP list this command uses */
	sq->cid = ctrlr->cid[NVME_IO_QUEUE_INDEX]++;
	sq->nsid = drive->namespace_id;
	status = nvme_fill_prp(ctrlr->prp_list[sq->cid], sq->prp, buffer, count * drive->dev.block_size);
	if (NVME_ERROR(status)) {
		printf("nvme_internal_write: error %d generating PRP(s)\n",status);
		return status;
	}
	/* CDW10/11 = starting LBA, low/high 32 bits */
	sq->cdw10 = start;
	sq->cdw11 = (start >> 32);
	/* CDW12 bits 15:0 = number of logical blocks, 0-based */
	sq->cdw12 = (count - 1) & 0xFFFF;
	/* Queue in host memory only; doorbell is rung by the caller */
	status = nvme_submit_cmd(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
	return status;
}
/* Write operation entrypoint.
 * Cuts the request into chunks no larger than the device/driver maximum
 * transfer size, queues each chunk, then rings the SQ doorbell once and
 * polls for completion of the whole batch.
 *
 * me: BlockDevOps embedded in an NvmeDrive
 * start: first LBA
 * count: number of blocks to write
 * buffer: source data
 * Returns the number of blocks written, or -1 on error.
 */
static lba_t nvme_write(BlockDevOps *me, lba_t start, lba_t count,
			const void *buffer)
{
	NvmeDrive *drive = container_of(me, NvmeDrive, dev.ops);
	NvmeCtrlr *ctrlr = drive->ctrlr;
	uint64_t max_transfer_blocks = 0;
	uint32_t block_size = drive->dev.block_size;
	lba_t orig_count = count;
	int status = NVME_SUCCESS;

	DEBUG(printf("nvme_write: Writing to namespace %d\n",drive->namespace_id);)
	/* MDTS is a power-of-2 multiple of the minimum page size; 0 means
	 * "no limit". Shift in 64 bits to avoid int overflow UB. */
	if (ctrlr->controller_data->mdts != 0)
		max_transfer_blocks = (((uint64_t)1 << (ctrlr->controller_data->mdts)) * ((uint64_t)1 << NVME_CAP_MPSMIN(ctrlr->cap))) / block_size;
	/* Artificially limit max_transfer_blocks to 1 PRP List.
	 * BUGFIX: the limit in blocks is NVME_MAX_XFER_BYTES / block_size;
	 * the previous code multiplied by block_size here, which made the
	 * clamp ineffective for controllers reporting a large MDTS. */
	if ((max_transfer_blocks == 0) ||
	    (max_transfer_blocks > (NVME_MAX_XFER_BYTES / block_size)))
		max_transfer_blocks = NVME_MAX_XFER_BYTES / block_size;

	/* Queue one command per chunk; stop early on error */
	while (count > 0) {
		if (count > max_transfer_blocks) {
			DEBUG(printf("nvme_write: partial write of %llu blocks\n",(unsigned long long)max_transfer_blocks);)
			status = nvme_internal_write(drive, (void *)buffer, start, max_transfer_blocks);
			count -= max_transfer_blocks;
			buffer += max_transfer_blocks*block_size;
			start += max_transfer_blocks;
		} else {
			DEBUG(printf("nvme_write final write of %llu blocks\n",(unsigned long long)count);)
			status = nvme_internal_write(drive, (void *)buffer, start, count);
			count = 0;
		}
		if (NVME_ERROR(status))
			break;
	}
	if (NVME_ERROR(status)) {
		printf("nvme_write: error %d\n",status);
		return -1;
	}
	/* Submit the whole batch to the controller */
	nvme_ring_sq_doorbell(ctrlr, NVME_IO_QUEUE_INDEX);
	/* Complete submitted command(s) */
	nvme_complete_cmds_polled(ctrlr,
				  NVME_IO_QUEUE_INDEX,
				  NVME_CCQ_SIZE,
				  NVME_GENERIC_TIMEOUT);
	DEBUG(printf("nvme_write: lba = 0x%08x, Original = 0x%08x, Remaining = 0x%08x, BlockSize = 0x%x Status = %d\n", (uint32_t)start, (uint32_t)orig_count, (uint32_t)count, block_size, status);)
	return orig_count - count;
}
/* Sends the Identify Controller command; saves the result in
 * ctrlr->controller_data (page-aligned DMA memory, owned by ctrlr and
 * freed in nvme_shutdown).
 *
 * ctrlr: NVMe controller handle
 * Returns NVME_SUCCESS, NVME_OUT_OF_RESOURCES, or a command status.
 */
static NVME_STATUS nvme_identify(NvmeCtrlr *ctrlr) {
NVME_SQ *sq;
int status = NVME_SUCCESS;
ctrlr->controller_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_CONTROLLER_DATA));
if (ctrlr->controller_data == NULL) {
printf("nvme_identify: ERROR - out of memory\n");
return NVME_OUT_OF_RESOURCES;
}
/* Build the Identify command in the next free admin SQ slot */
sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
memset(sq, 0, sizeof(NVME_SQ));
sq->opc = NVME_ADMIN_IDENTIFY_OPC;
sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
/* Identify structure is 4Kb in size. Fits in aligned 1 PAGE */
sq->prp[0] = (uintptr_t)virt_to_phys(ctrlr->controller_data);
/* Set bit 0 (Cns bit) to 1 to identify a controller */
sq->cdw10 = 1;
status = nvme_submit_cmd_polled(ctrlr,
NVME_ADMIN_QUEUE_INDEX,
NVME_ASQ_SIZE,
NVME_ACQ_SIZE,
NVME_GENERIC_TIMEOUT);
if (NVME_ERROR(status))
return status;
/* Force-terminate the space-padded serial/model strings so they can
 * be printed safely */
ctrlr->controller_data->sn[19] = 0;
ctrlr->controller_data->mn[39] = 0;
DEBUG(printf(" == NVME IDENTIFY CONTROLLER DATA ==\n");)
DEBUG(printf(" PCI VID : 0x%x\n", ctrlr->controller_data->vid);)
DEBUG(printf(" PCI SSVID : 0x%x\n", ctrlr->controller_data->ssvid);)
DEBUG(printf(" SN : %s\n", (char *)(ctrlr->controller_data->sn));)
DEBUG(printf(" MN : %s\n", (char *)(ctrlr->controller_data->mn));)
DEBUG(printf(" RAB : 0x%x\n", ctrlr->controller_data->rab);)
DEBUG(printf(" AERL : 0x%x\n", ctrlr->controller_data->aerl);)
DEBUG(printf(" SQES : 0x%x\n", ctrlr->controller_data->sqes);)
DEBUG(printf(" CQES : 0x%x\n", ctrlr->controller_data->cqes);)
DEBUG(printf(" NN : 0x%x\n", ctrlr->controller_data->nn);)
return status;
}
/* Sends Identify Namespace for each namespace reported by the
 * controller and creates an NvmeDrive/BlockDev node per valid
 * namespace. Requires nvme_identify() to have run first (needs
 * controller_data->nn).
 *
 * ctrlr: NVMe controller handle
 * Returns NVME_SUCCESS or an NVME_* error code.
 */
static NVME_STATUS nvme_identify_namespaces(NvmeCtrlr *ctrlr) {
NVME_SQ *sq;
NVME_ADMIN_NAMESPACE_DATA *namespace_data = NULL;
int status = NVME_SUCCESS;
if (ctrlr->controller_data == NULL) {
printf("nvme_identify_namespaces: ERROR - must complete Identify command first\n");
return NVME_INVALID_PARAMETER;
}
/* Scratch page, reused for every namespace and freed on exit */
namespace_data = dma_memalign(NVME_PAGE_SIZE, sizeof(NVME_ADMIN_NAMESPACE_DATA));
if (namespace_data == NULL) {
printf("nvme_identify_namespaces: ERROR - out of memory\n");
return NVME_OUT_OF_RESOURCES;
}
/* Namespace IDs are 1-based; NN is the number of namespaces */
for (uint32_t index = 1; index <= ctrlr->controller_data->nn; index++) {
DEBUG(printf("nvme_identify_namespaces: Working on namespace %d\n",index);)
sq = ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] + ctrlr->sq_t_dbl[NVME_ADMIN_QUEUE_INDEX];
memset(sq, 0, sizeof(NVME_SQ));
sq->opc = NVME_ADMIN_IDENTIFY_OPC;
sq->cid = ctrlr->cid[NVME_ADMIN_QUEUE_INDEX]++;
sq->nsid = index;
/* Identify structure is 4Kb in size. Fits in 1 aligned PAGE */
sq->prp[0] = (uintptr_t)virt_to_phys(namespace_data);
/* Clear bit 0 (Cns bit) to identify a namespace (cdw10 left 0 by memset) */
status = nvme_submit_cmd_polled(ctrlr,
NVME_ADMIN_QUEUE_INDEX,
NVME_ASQ_SIZE,
NVME_ACQ_SIZE,
NVME_GENERIC_TIMEOUT);
if (NVME_ERROR(status))
goto exit;
DEBUG(printf(" == NVME IDENTIFY NAMESPACE [%d] DATA ==\n", index);)
DEBUG(printf(" NSZE : 0x%llx\n", namespace_data->nsze);)
DEBUG(printf(" NCAP : 0x%llx\n", namespace_data->ncap);)
DEBUG(printf(" NUSE : 0x%llx\n", namespace_data->nuse);)
DEBUG(printf(" LBAF0.LBADS : 0x%x\n", (namespace_data->lba_format[0].lbads));)
if (namespace_data->ncap == 0) {
printf("nvme_identify_namespaces: ERROR - namespace %d has zero capacity\n", index);
status = NVME_DEVICE_ERROR;
goto exit;
} else {
/* Create drive node. The name buffer and drive struct are owned by
 * the blockdev list from here on. */
NvmeDrive *nvme_drive = xzalloc(sizeof(*nvme_drive));
static const int name_size = 21;
char *name = xmalloc(name_size);
snprintf(name, name_size, "NVMe Namespace %d", index);
nvme_drive->dev.ops.read = &nvme_read;
nvme_drive->dev.ops.write = &nvme_write;
nvme_drive->dev.ops.new_stream = &new_simple_stream;
nvme_drive->dev.name = name;
nvme_drive->dev.removable = 0;
/* LBADS is log2 of the block size; 2 << (lbads - 1) == 1 << lbads */
nvme_drive->dev.block_size = 2 << (namespace_data->lba_format[namespace_data->flbas & 0xF].lbads - 1);
nvme_drive->dev.block_count = namespace_data->nsze;
nvme_drive->ctrlr = ctrlr;
nvme_drive->namespace_id = index;
list_insert_after(&nvme_drive->dev.list_node,
&fixed_block_devices);
list_insert_after(&nvme_drive->list_node, &ctrlr->drives);
printf("Added NVMe drive \"%s\" lbasize:%d, count:0x%llx\n", nvme_drive->dev.name, nvme_drive->dev.block_size, (uint64_t)nvme_drive->dev.block_count);
}
}
exit:
if (namespace_data != NULL)
free(namespace_data);
return status;
}
/* Initialization entrypoint (BlockDevCtrlrOps.update callback).
 * Validates the PCI class codes, maps the register BAR, resets the
 * controller, sets up the admin and IO queue pairs, and enumerates
 * namespaces into BlockDev nodes.
 *
 * me: ops struct embedded in an NvmeCtrlr
 * Returns 0 on success, non-zero on any NVME_* error.
 */
static int nvme_ctrlr_init(BlockDevCtrlrOps *me)
{
NvmeCtrlr *ctrlr = container_of(me, NvmeCtrlr, ctrlr.ops);
pcidev_t dev = ctrlr->dev;
int status = NVME_SUCCESS;
/* Require class/subclass/prog-if == mass storage / NVM / NVMHCI */
if ((pci_read_config8(ctrlr->dev, REG_PROG_IF) != PCI_IF_NVMHCI)
|| (pci_read_config8(ctrlr->dev, REG_SUBCLASS) != PCI_CLASS_MASS_STORAGE_NVM)
|| (pci_read_config8(ctrlr->dev, REG_CLASS) != PCI_CLASS_MASS_STORAGE)) {
printf("Unsupported NVMe controller found\n");
status = NVME_UNSUPPORTED;
goto exit;
}
printf("Initializing NVMe controller %04x:%04x\n",
pci_read_config16(ctrlr->dev, REG_VENDOR_ID),
pci_read_config16(ctrlr->dev, REG_DEVICE_ID));
pci_set_bus_master(dev);
/* Map BAR0 (mask off the low flag bits) and read CAP */
ctrlr->ctrlr_regs = pci_read_resource(dev,0);
ctrlr->ctrlr_regs = ctrlr->ctrlr_regs & ~0x7;
ctrlr->cap = readll(ctrlr->ctrlr_regs + NVME_CAP_OFFSET);
/* Verify that the NVM command set is supported */
if (NVME_CAP_CSS(ctrlr->cap) != NVME_CAP_CSS_NVM) {
printf("NVMe Cap CSS not NVMe (CSS=%01x. Unsupported controller.\n",(uint8_t)NVME_CAP_CSS(ctrlr->cap));
status = NVME_UNSUPPORTED;
goto exit;
}
/* Driver only supports 4k page size */
if (NVME_CAP_MPSMIN(ctrlr->cap) > NVME_PAGE_SHIFT) {
printf("NVMe driver only supports 4k page size. Unsupported controller.\n");
status = NVME_UNSUPPORTED;
goto exit;
}
/* Calculate max io sq/cq sizes: configured size clamped by CAP.MQES */
ctrlr->iosq_sz = (NVME_CSQ_SIZE > NVME_CAP_MQES(ctrlr->cap)) ? NVME_CAP_MQES(ctrlr->cap) : NVME_CSQ_SIZE;
ctrlr->iocq_sz = (NVME_CCQ_SIZE > NVME_CAP_MQES(ctrlr->cap)) ? NVME_CAP_MQES(ctrlr->cap) : NVME_CCQ_SIZE;
DEBUG(printf("iosq_sz = %u, iocq_sz = %u\n",ctrlr->iosq_sz,ctrlr->iocq_sz);)
/* Allocate one PRP List page per possible in-flight command, so no
 * allocation is needed at IO time */
for (unsigned int list_index = 0; list_index < ctrlr->iosq_sz; list_index++) {
ctrlr->prp_list[list_index] = dma_memalign(NVME_PAGE_SIZE, NVME_PAGE_SIZE);
if (!(ctrlr->prp_list[list_index])) {
printf("NVMe driver failed to allocate prp list %u memory\n",list_index);
status = NVME_OUT_OF_RESOURCES;
goto exit;
}
memset(ctrlr->prp_list[list_index], 0, NVME_PAGE_SIZE);
}
/* Allocate queue memory block: one page per queue (admin+IO, SQ+CQ) */
ctrlr->buffer = dma_memalign(NVME_PAGE_SIZE, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);
if (!(ctrlr->buffer)) {
printf("NVMe driver failed to allocate queue buffer\n");
status = NVME_OUT_OF_RESOURCES;
goto exit;
}
memset(ctrlr->buffer, 0, (NVME_NUM_QUEUES * 2) * NVME_PAGE_SIZE);
/* Disable controller before programming the admin queue registers */
status = nvme_disable_controller(ctrlr);
if (NVME_ERROR(status))
goto exit;
/* Create Admin queue pair */
NVME_AQA aqa = 0;
NVME_ASQ asq = 0;
NVME_ACQ acq = 0;
/* Verify defined queue sizes are within NVME_PAGE_SIZE limits */
#if NVME_ASQ_SIZE != 2
#error "Unsupported Admin SQ size defined"
#endif
#if NVME_ACQ_SIZE != 2
#error "Unsupported Admin CQ size defined"
#endif
#if (NVME_CSQ_SIZE < 2) || (NVME_CSQ_SIZE > (NVME_PAGE_SIZE / 64))
#error "Unsupported IO SQ size defined"
#endif
#if (NVME_CCQ_SIZE < 2) || (NVME_CCQ_SIZE > (NVME_PAGE_SIZE / 64))
#error "Unsupported IO CQ size defined"
#endif
/* Set number of entries Admin submission & completion queues. */
aqa |= NVME_AQA_ASQS(NVME_ASQ_SIZE);
aqa |= NVME_AQA_ACQS(NVME_ACQ_SIZE);
/* Address of Admin submission queue (page 0 of the buffer). */
asq = (uintptr_t)virt_to_phys(ctrlr->buffer);
ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_SQ *)ctrlr->buffer;
/* Address of Admin completion queue (page 1). */
acq = (uintptr_t)virt_to_phys(ctrlr->buffer + NVME_PAGE_SIZE);
ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX] = (NVME_CQ *)(ctrlr->buffer + NVME_PAGE_SIZE);
/* Address of I/O submission & completion queues (pages 2 and 3) */
ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX] =
(NVME_SQ *)(ctrlr->buffer + 2 * NVME_PAGE_SIZE);
ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX] =
(NVME_CQ *)(ctrlr->buffer + 3 * NVME_PAGE_SIZE);
DEBUG(printf("Private->Buffer = [%p]\n", (void *)virt_to_phys(ctrlr->buffer));)
DEBUG(printf("Admin Queue Attributes = [%X]\n", aqa);)
DEBUG(printf("Admin Submission Queue (sq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
DEBUG(printf("Admin Completion Queue (cq_buffer[ADMIN]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_ADMIN_QUEUE_INDEX]));)
DEBUG(printf("I/O Submission Queue (sq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->sq_buffer[NVME_IO_QUEUE_INDEX]));)
DEBUG(printf("I/O Completion Queue (cq_buffer[NVME_IO_QUEUE]) = [%p]\n", (void *)virt_to_phys(ctrlr->cq_buffer[NVME_IO_QUEUE_INDEX]));)
/* Program the admin queue registers: attributes, SQ base, CQ base */
writel(aqa, ctrlr->ctrlr_regs + NVME_AQA_OFFSET);
writell(asq, ctrlr->ctrlr_regs + NVME_ASQ_OFFSET);
writell(acq, ctrlr->ctrlr_regs + NVME_ACQ_OFFSET);
/* Enable controller */
status = nvme_enable_controller(ctrlr);
if (NVME_ERROR(status))
goto exit;
/* Create IO queue pair (CQ must exist before its SQ) */
status = nvme_create_cq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iocq_sz);
if (NVME_ERROR(status))
goto exit;
status = nvme_create_sq(ctrlr, NVME_IO_QUEUE_INDEX, ctrlr->iosq_sz);
if (NVME_ERROR(status))
goto exit;
/* Identify */
status = nvme_identify(ctrlr);
if (NVME_ERROR(status))
goto exit;
/* Identify Namespace and create drive nodes */
status = nvme_identify_namespaces(ctrlr);
if (NVME_ERROR(status))
goto exit;
exit:
/* Mark init attempted either way; nvme_shutdown checks this flag */
ctrlr->ctrlr.need_update = 0;
return NVME_ERROR(status);
}
/* Cleanup callback: quiesce the controller and release driver memory.
 *
 * cleanup: CleanupFunc whose data member holds the NvmeCtrlr
 * type: cleanup reason (unused)
 * Returns 0 on success, 1 on failure.
 */
static int nvme_shutdown(struct CleanupFunc *cleanup, CleanupType type)
{
	NvmeCtrlr *ctrlr = (NvmeCtrlr *)cleanup->data;
	NvmeDrive *drive;
	int status = NVME_SUCCESS;

	printf("Shutting down NVMe controller.\n");
	if (NULL == ctrlr)
		return 1;
	/* Only disable controller if initialized (init clears need_update) */
	if (ctrlr->ctrlr.need_update != 1) {
		status = nvme_disable_controller(ctrlr);
		if (NVME_ERROR(status))
			return 1;
	}
	/* NOTE(review): freeing each node inside list_for_each reads
	 * drive->list_node.next after free() — use-after-free during
	 * traversal. Kept as-is pending a safe-iteration helper in the
	 * depthcharge list API; confirm and fix there. */
	list_for_each(drive, ctrlr->drives, list_node) {
		free(drive);
	}
	free(ctrlr->controller_data);
	/* BUGFIX: prp_list is an array member of *ctrlr, so the previous
	 * free(ctrlr->prp_list) handed free() a pointer into the middle of
	 * the ctrlr allocation (heap corruption). Free the individually
	 * allocated PRP list pages instead; free(NULL) is a no-op for any
	 * entries that were never allocated. */
	for (unsigned int i = 0; i < ctrlr->iosq_sz; i++)
		free(ctrlr->prp_list[i]);
	free(ctrlr->buffer);
	free(ctrlr);
	return 0;
}
/* Setup controller initialization/shutdown callbacks.
 * Used in board.c to get handle to new ctrlr.
 * Allocates the NvmeCtrlr, registers nvme_ctrlr_init as the lazy
 * update callback, and hooks nvme_shutdown into the global cleanup
 * list. The static CleanupFunc means only ONE controller instance is
 * supported; the assert enforces single use.
 *
 * dev: PCI device handle for the controller
 * Returns the new controller handle (never NULL; xzalloc aborts on OOM).
 */
NvmeCtrlr *new_nvme_ctrlr(pcidev_t dev)
{
NvmeCtrlr *ctrlr = xzalloc(sizeof(*ctrlr));
static CleanupFunc cleanup = {
&nvme_shutdown,
CleanupOnHandoff | CleanupOnLegacy,
NULL
};
/* Trip if this function is ever called twice */
assert(cleanup.data == NULL);
printf("New NVMe Controller %p @ %02x:%02x:%02x\n",
ctrlr, PCI_BUS(dev),PCI_SLOT(dev),PCI_FUNC(dev));
/* Defer real init to nvme_ctrlr_init via the need_update mechanism */
ctrlr->ctrlr.ops.update = &nvme_ctrlr_init;
ctrlr->ctrlr.need_update = 1;
ctrlr->dev = dev;
cleanup.data = (void *)ctrlr;
list_insert_after(&cleanup.list_node, &cleanup_funcs);
return ctrlr;
}