2 * Copyright (C) 2012 Intel Corporation
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 #include <sys/types.h>
/*
 * ioctl request codes (group letter n), built with the _IOWR and _IO
 * request-encoding macros.  NVME_PASSTHROUGH_CMD carries a
 * struct nvme_pt_command, NVME_IO_TEST and NVME_BIO_TEST both carry a
 * struct nvme_io_test, and NVME_RESET_CONTROLLER carries no argument.
 */
36 #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
37 #define NVME_RESET_CONTROLLER _IO('n', 1)
39 #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
40 #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
43 * Used to mark a command to apply to all namespaces, or to retrieve global
/*
 * All-ones 32-bit value; the cast gives it type uint32_t, the same type as
 * the nsid (namespace identifier) command field declared in this header.
 */
46 #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
48 union cap_lo_register {
51 /** maximum queue entries supported */
54 /** contiguous queues required */
57 /** arbitration mechanism supported */
60 uint32_t reserved1 : 5;
67 union cap_hi_register {
70 /** doorbell stride */
73 uint32_t reserved3 : 1;
75 /** command sets supported */
78 uint32_t css_reserved : 3;
79 uint32_t reserved2 : 7;
81 /** memory page size minimum */
84 /** memory page size maximum */
87 uint32_t reserved1 : 8;
97 uint32_t reserved1 : 3;
99 /** i/o command set selected */
102 /** memory page size */
105 /** arbitration mechanism selected */
108 /** shutdown notification */
111 /** i/o submission queue entry size */
114 /** i/o completion queue entry size */
117 uint32_t reserved2 : 8;
122 NVME_SHN_NORMAL = 0x1,
123 NVME_SHN_ABRUPT = 0x2,
126 union csts_register {
132 /** controller fatal status */
135 /** shutdown status */
138 uint32_t reserved1 : 28;
143 NVME_SHST_NORMAL = 0x0,
144 NVME_SHST_OCCURRING = 0x1,
145 NVME_SHST_COMPLETE = 0x2,
151 /** admin submission queue size */
154 uint32_t reserved1 : 4;
156 /** admin completion queue size */
159 uint32_t reserved2 : 4;
163 struct nvme_registers
165 /** controller capabilities */
166 union cap_lo_register cap_lo;
167 union cap_hi_register cap_hi;
169 uint32_t vs; /* version */
170 uint32_t intms; /* interrupt mask set */
171 uint32_t intmc; /* interrupt mask clear */
173 /** controller configuration */
174 union cc_register cc;
177 uint32_t csts; /* controller status */
180 /** admin queue attributes */
181 union aqa_register aqa;
183 uint64_t asq; /* admin submission queue base addr */
184 uint64_t acq; /* admin completion queue base addr */
185 uint32_t reserved3[0x3f2];
188 uint32_t sq_tdbl; /* submission queue tail doorbell */
189 uint32_t cq_hdbl; /* completion queue head doorbell */
190 } doorbell[1] __packed;
196 uint16_t opc : 8; /* opcode */
197 uint16_t fuse : 2; /* fused operation */
199 uint16_t cid; /* command identifier */
202 uint32_t nsid; /* namespace identifier */
209 uint64_t mptr; /* metadata pointer */
212 uint64_t prp1; /* prp entry 1 */
215 uint64_t prp2; /* prp entry 2 */
218 uint32_t cdw10; /* command-specific */
219 uint32_t cdw11; /* command-specific */
220 uint32_t cdw12; /* command-specific */
221 uint32_t cdw13; /* command-specific */
222 uint32_t cdw14; /* command-specific */
223 uint32_t cdw15; /* command-specific */
228 uint16_t p : 1; /* phase tag */
229 uint16_t sc : 8; /* status code */
230 uint16_t sct : 3; /* status code type */
232 uint16_t m : 1; /* more */
233 uint16_t dnr : 1; /* do not retry */
236 struct nvme_completion {
239 uint32_t cdw0; /* command-specific */
245 uint16_t sqhd; /* submission queue head pointer */
246 uint16_t sqid; /* submission queue identifier */
249 uint16_t cid; /* command identifier */
250 struct nvme_status status;
253 struct nvme_dsm_range {
257 uint64_t starting_lba;
260 /* status code types */
/*
 * Every value here fits in 3 bits, the width of the sct bit-field
 * (status code type) declared earlier in this header.
 * NOTE(review): presumably these are the values stored in that field - confirm.
 */
261 enum nvme_status_code_type {
262 NVME_SCT_GENERIC = 0x0,
263 NVME_SCT_COMMAND_SPECIFIC = 0x1,
264 NVME_SCT_MEDIA_ERROR = 0x2,
265 /* 0x3-0x6 - reserved */
266 NVME_SCT_VENDOR_SPECIFIC = 0x7,
269 /* generic command status codes */
/*
 * All values fit in the 8-bit sc bit-field (status code).  The numeric
 * values 0x80-0x82 are reused by the command specific and media error
 * enums in this header, so a status code value is ambiguous on its own.
 * NOTE(review): presumably these apply when sct is NVME_SCT_GENERIC - confirm.
 */
270 enum nvme_generic_command_status_code {
271 NVME_SC_SUCCESS = 0x00,
272 NVME_SC_INVALID_OPCODE = 0x01,
273 NVME_SC_INVALID_FIELD = 0x02,
274 NVME_SC_COMMAND_ID_CONFLICT = 0x03,
275 NVME_SC_DATA_TRANSFER_ERROR = 0x04,
276 NVME_SC_ABORTED_POWER_LOSS = 0x05,
277 NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
278 NVME_SC_ABORTED_BY_REQUEST = 0x07,
279 NVME_SC_ABORTED_SQ_DELETION = 0x08,
280 NVME_SC_ABORTED_FAILED_FUSED = 0x09,
281 NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
282 NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
283 NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
/* 0x0d-0x7f - not assigned in this enum */
285 NVME_SC_LBA_OUT_OF_RANGE = 0x80,
286 NVME_SC_CAPACITY_EXCEEDED = 0x81,
287 NVME_SC_NAMESPACE_NOT_READY = 0x82,
290 /* command specific status codes */
/*
 * Numeric values overlap with the generic and media error status code
 * enums in this header (for example 0x00-0x03 and 0x80-0x82), so a value
 * is ambiguous on its own.
 * NOTE(review): presumably these apply when sct is
 * NVME_SCT_COMMAND_SPECIFIC - confirm.
 */
291 enum nvme_command_specific_status_code {
292 NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
293 NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
294 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
295 NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
296 /* 0x04 - reserved */
297 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
298 NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
299 NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
300 NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
301 NVME_SC_INVALID_LOG_PAGE = 0x09,
302 NVME_SC_INVALID_FORMAT = 0x0a,
303 NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
/* 0x0c-0x7f - not assigned in this enum */
305 NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
306 NVME_SC_INVALID_PROTECTION_INFO = 0x81,
307 NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
310 /* media error status codes */
/*
 * Only values 0x80-0x86 are assigned here; 0x80-0x82 are also used by the
 * generic and command specific status code enums in this header.
 * NOTE(review): presumably these apply when sct is NVME_SCT_MEDIA_ERROR -
 * confirm.
 */
311 enum nvme_media_error_status_code {
312 NVME_SC_WRITE_FAULTS = 0x80,
313 NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
314 NVME_SC_GUARD_CHECK_ERROR = 0x82,
315 NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
316 NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
317 NVME_SC_COMPARE_FAILURE = 0x85,
318 NVME_SC_ACCESS_DENIED = 0x86,
/*
 * Opcode values for admin commands.  All values fit in the 8-bit opc
 * bit-field (opcode) declared earlier in this header.  Several numeric
 * values (0x00-0x02, 0x04, 0x05) are also used by enum nvme_nvm_opcode, so
 * the number alone does not identify a command.
 */
322 enum nvme_admin_opcode {
323 NVME_OPC_DELETE_IO_SQ = 0x00,
324 NVME_OPC_CREATE_IO_SQ = 0x01,
325 NVME_OPC_GET_LOG_PAGE = 0x02,
326 /* 0x03 - reserved */
327 NVME_OPC_DELETE_IO_CQ = 0x04,
328 NVME_OPC_CREATE_IO_CQ = 0x05,
329 NVME_OPC_IDENTIFY = 0x06,
330 /* 0x07 - reserved */
331 NVME_OPC_ABORT = 0x08,
332 NVME_OPC_SET_FEATURES = 0x09,
333 NVME_OPC_GET_FEATURES = 0x0a,
334 /* 0x0b - reserved */
335 NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
336 /* 0x0d-0x0f - reserved */
337 NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
338 NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
/* 0x12-0x7f - not assigned in this enum */
340 NVME_OPC_FORMAT_NVM = 0x80,
341 NVME_OPC_SECURITY_SEND = 0x81,
342 NVME_OPC_SECURITY_RECEIVE = 0x82,
345 /* nvme nvm opcodes */
/*
 * Opcode values for NVM commands.  Several numeric values are shared with
 * enum nvme_admin_opcode (for example 0x00 is both NVME_OPC_FLUSH and
 * NVME_OPC_DELETE_IO_SQ), so the number alone does not identify a command.
 * This is also the type of the opc member of struct nvme_io_test.
 */
346 enum nvme_nvm_opcode {
347 NVME_OPC_FLUSH = 0x00,
348 NVME_OPC_WRITE = 0x01,
349 NVME_OPC_READ = 0x02,
350 /* 0x03 - reserved */
351 NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
352 NVME_OPC_COMPARE = 0x05,
353 /* 0x06-0x07 - reserved */
/* 0x08 - no opcode defined in this enum (not covered by the range above) */
354 NVME_OPC_DATASET_MANAGEMENT = 0x09,
358 /* 0x00 - reserved */
359 NVME_FEAT_ARBITRATION = 0x01,
360 NVME_FEAT_POWER_MANAGEMENT = 0x02,
361 NVME_FEAT_LBA_RANGE_TYPE = 0x03,
362 NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
363 NVME_FEAT_ERROR_RECOVERY = 0x05,
364 NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
365 NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
366 NVME_FEAT_INTERRUPT_COALESCING = 0x08,
367 NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
368 NVME_FEAT_WRITE_ATOMICITY = 0x0A,
369 NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
370 /* 0x0C-0x7F - reserved */
371 NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
372 /* 0x81-0xBF - command set specific (reserved) */
373 /* 0xC0-0xFF - vendor specific */
/*
 * Dataset management attribute values.  Each enumerator is a distinct
 * single bit (0x1, 0x2, 0x4).
 * NOTE(review): presumably meant to be OR-ed into a command dword of a
 * NVME_OPC_DATASET_MANAGEMENT command - confirm against the caller.
 */
376 enum nvme_dsm_attribute {
377 NVME_DSM_ATTR_INTEGRAL_READ = 0x1,
378 NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2,
379 NVME_DSM_ATTR_DEALLOCATE = 0x4,
382 struct nvme_controller_data {
384 /* bytes 0-255: controller capabilities and features */
389 /** pci subsystem vendor id */
398 /** firmware revision */
401 /** recommended arbitration burst */
404 /** ieee oui identifier */
407 /** multi-interface capabilities */
410 /** maximum data transfer size */
413 uint8_t reserved1[178];
415 /* bytes 256-511: admin command set attributes */
417 /** optional admin command support */
419 /* supports security send/receive commands */
420 uint16_t security : 1;
422 /* supports format nvm command */
425 /* supports firmware activate/download commands */
426 uint16_t firmware : 1;
428 uint16_t oacs_rsvd : 13;
431 /** abort command limit */
434 /** asynchronous event request limit */
437 /** firmware updates */
439 /* first slot is read-only */
440 uint8_t slot1_ro : 1;
442 /* number of firmware slots */
443 uint8_t num_slots : 3;
445 uint8_t frmw_rsvd : 4;
448 /** log page attributes */
450 /* per namespace smart/health log page */
451 uint8_t ns_smart : 1;
453 uint8_t lpa_rsvd : 7;
456 /** error log page entries */
459 /** number of power states supported */
462 /** admin vendor specific command configuration */
464 /* admin vendor specific commands use spec format */
465 uint8_t spec_format : 1;
467 uint8_t avscc_rsvd : 7;
470 uint8_t reserved2[247];
472 /* bytes 512-703: nvm command set attributes */
474 /** submission queue entry size */
480 /** completion queue entry size */
486 uint8_t reserved3[2];
488 /** number of namespaces */
491 /** optional nvm command support */
493 uint16_t compare : 1;
494 uint16_t write_unc : 1;
496 uint16_t reserved: 13;
499 /** fused operation support */
502 /** format nvm attributes */
505 /** volatile write cache */
508 uint8_t reserved : 7;
511 /* TODO: flesh out remaining nvm command set attributes */
512 uint8_t reserved4[178];
514 /* bytes 704-2047: i/o command set attributes */
515 uint8_t reserved5[1344];
517 /* bytes 2048-3071: power state descriptors */
518 uint8_t reserved6[1024];
520 /* bytes 3072-4095: vendor specific */
521 uint8_t reserved7[1024];
522 } __packed __aligned(4);
524 struct nvme_namespace_data {
526 /** namespace size */
529 /** namespace capacity */
532 /** namespace utilization */
535 /** namespace features */
537 /** thin provisioning */
538 uint8_t thin_prov : 1;
539 uint8_t reserved1 : 7;
542 /** number of lba formats */
545 /** formatted lba size */
548 uint8_t extended : 1;
549 uint8_t reserved2 : 3;
552 /** metadata capabilities */
554 /* metadata can be transferred as part of data prp list */
555 uint8_t extended : 1;
557 /* metadata can be transferred with separate metadata pointer */
560 uint8_t reserved3 : 6;
563 /** end-to-end data protection capabilities */
565 /* protection information type 1 */
568 /* protection information type 2 */
571 /* protection information type 3 */
574 /* first eight bytes of metadata */
575 uint8_t md_start : 1;
577 /* last eight bytes of metadata */
581 /** end-to-end data protection type settings */
583 /* protection information type */
586 /* 1 == protection info transferred at start of metadata */
587 /* 0 == protection info transferred at end of metadata */
588 uint8_t md_start : 1;
590 uint8_t reserved4 : 4;
593 uint8_t reserved5[98];
595 /** lba format support */
603 /** relative performance */
606 uint32_t reserved6 : 6;
609 uint8_t reserved6[192];
611 uint8_t vendor_specific[3712];
612 } __packed __aligned(4);
616 /* 0x00 - reserved */
617 NVME_LOG_ERROR = 0x01,
618 NVME_LOG_HEALTH_INFORMATION = 0x02,
619 NVME_LOG_FIRMWARE_SLOT = 0x03,
620 /* 0x04-0x7F - reserved */
621 /* 0x80-0xBF - I/O command set specific */
622 /* 0xC0-0xFF - vendor specific */
625 struct nvme_error_information_entry {
627 uint64_t error_count;
630 struct nvme_status status;
631 uint16_t error_location;
634 uint8_t vendor_specific;
635 uint8_t reserved[35];
636 } __packed __aligned(4);
638 union nvme_critical_warning_state {
643 uint8_t available_spare : 1;
644 uint8_t temperature : 1;
645 uint8_t device_reliability : 1;
646 uint8_t read_only : 1;
647 uint8_t volatile_memory_backup : 1;
648 uint8_t reserved : 3;
/*
 * Health information page layout (presumably the payload returned for
 * NVME_LOG_HEALTH_INFORMATION - confirm).
 *
 * With __packed, the members listed here add up to 512 bytes
 * (1 + 2 + 3 + 26 + 10 * 16 + 320), assuming the critical_warning union
 * occupies a single byte - TODO confirm.
 *
 * NOTE(review): for the two-element uint64_t counters, which index holds
 * the low 64 bits is not stated here; confirm before combining them.
 * Units for temperature are not stated here either.
 */
652 struct nvme_health_information_page {
654 union nvme_critical_warning_state critical_warning;
656 uint16_t temperature;
657 uint8_t available_spare;
658 uint8_t available_spare_threshold;
659 uint8_t percentage_used;
661 uint8_t reserved[26];
664 * Note that the following are 128-bit values, but are
665 * defined as an array of 2 64-bit values.
667 /* Data Units Read is always in 512-byte units. */
668 uint64_t data_units_read[2];
669 /* Data Units Written is always in 512-byte units. */
670 uint64_t data_units_written[2];
671 /* For NVM command set, this includes Compare commands. */
672 uint64_t host_read_commands[2];
673 uint64_t host_write_commands[2];
674 /* Controller Busy Time is reported in minutes. */
675 uint64_t controller_busy_time[2];
676 uint64_t power_cycles[2];
677 uint64_t power_on_hours[2];
678 uint64_t unsafe_shutdowns[2];
679 uint64_t media_errors[2];
680 uint64_t num_error_info_log_entries[2];
682 uint8_t reserved2[320];
683 } __packed __aligned(4);
685 struct nvme_firmware_page {
688 uint8_t slot : 3; /* slot for current FW */
689 uint8_t reserved : 5;
693 uint64_t revision[7]; /* revisions for 7 slots */
694 uint8_t reserved2[448];
695 } __packed __aligned(4);
/*
 * Number of elements in the io_completed[] array of struct nvme_io_test.
 * NOTE(review): presumably also the upper bound for num_threads - confirm
 * against the code that handles the test ioctls.
 */
697 #define NVME_TEST_MAX_THREADS 128
699 struct nvme_io_test {
701 enum nvme_nvm_opcode opc;
703 uint32_t time; /* in seconds */
704 uint32_t num_threads;
706 uint32_t io_completed[NVME_TEST_MAX_THREADS];
709 enum nvme_io_test_flags {
712 * Specifies whether dev_refthread/dev_relthread should be
713 * called during NVME_BIO_TEST. Ignored for other test
716 NVME_TEST_FLAG_REFTHREAD = 0x1,
719 struct nvme_pt_command {
722 * cmd is used to specify a passthrough command to a controller or
725 * The following fields from cmd may be specified by the caller:
727 * * nsid (namespace id) - for admin commands only
730 * Remaining fields must be set to 0 by the caller.
732 struct nvme_command cmd;
735 * cpl returns completion status for the passthrough command
738 * The following fields will be filled out by the driver, for
739 * consumption by the caller:
741 * * status (except for phase)
743 * Remaining fields will be set to 0 by the driver.
745 struct nvme_completion cpl;
747 /* buf is the data buffer associated with this passthrough command. */
751 * len is the length of the data buffer associated with this
752 * passthrough command.
757 * is_read = 1 if the passthrough command will read data into the
760 * is_read = 0 if the passthrough command will write data from the
766 * driver_lock is used by the driver only. It must be set to 0
769 struct mtx * driver_lock;
/*
 * Evaluates to nonzero when either the status code (sc) or the status code
 * type (sct) of the completion is nonzero, and to zero when both are zero.
 * cpl must be a pointer to an object with a status member that exposes sc
 * and sct.  The phase, more and do-not-retry bits are not examined.
 *
 * The argument is expanded twice (and evaluated twice whenever sc is zero),
 * so do not pass an expression with side effects.
 */
772 #define nvme_completion_is_error(cpl) \
773 ((cpl)->status.sc != 0 || (cpl)->status.sct != 0)
/*
 * Forward declarations only.  These types are not defined in this part of
 * the header; the typedefs and prototypes that follow refer to them solely
 * through pointers.
 */
779 struct nvme_namespace;
780 struct nvme_controller;
781 struct nvme_consumer;
/*
 * Callback type taken as cb_fn by the command prototypes in this header;
 * each of them pairs it with a void *cb_arg.  It receives a void pointer
 * and a read-only completion entry and returns nothing.
 * NOTE(review): presumably the void pointer is the cb_arg that was passed
 * alongside cb_fn - confirm against the implementation.
 */
783 typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
/*
 * Consumer callback types.  nvme_register_consumer() takes one of each, in
 * the order ns_fn, ctrlr_fn, async_fn, fail_fn.  The ns and ctrlr callbacks
 * return a void pointer; the async and fail callbacks return nothing.
 * Parameters are unnamed here, so their meaning must be taken from the
 * implementation.
 */
785 typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
786 typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
787 typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
788 uint32_t, void *, uint32_t);
789 typedef void (*nvme_cons_fail_fn_t)(void *);
/*
 * Namespace capability flags.  Each enumerator is a distinct single bit.
 * NOTE(review): presumably these are the bits returned by
 * nvme_ns_get_flags(), which returns uint32_t - confirm.
 */
791 enum nvme_namespace_flags {
792 NVME_NS_DEALLOCATE_SUPPORTED = 0x1,
793 NVME_NS_FLUSH_SUPPORTED = 0x2,
796 int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
797 struct nvme_pt_command *pt,
798 uint32_t nsid, int is_user_buffer,
801 /* Admin functions */
/*
 * Returns void, so nothing is reported through the return value; cb_fn and
 * cb_arg are the only visible channel for a result.
 *   ctrlr         controller the command is directed at
 *   feature       8-bit feature id - presumably an NVME_FEAT_* value, confirm
 *   cdw11         32-bit value - presumably copied to command dword 11, confirm
 *   payload       data buffer; payload_size is its size (units assumed bytes)
 *   cb_fn/cb_arg  callback of type nvme_cb_fn_t and its opaque argument
 * NOTE(review): presumably issues NVME_OPC_SET_FEATURES - confirm.
 */
802 void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
803 uint8_t feature, uint32_t cdw11,
804 void *payload, uint32_t payload_size,
805 nvme_cb_fn_t cb_fn, void *cb_arg);
/*
 * Same parameter list as nvme_ctrlr_cmd_set_feature().  Returns void, so
 * cb_fn and cb_arg are the only visible channel for a result.
 *   feature       8-bit feature id - presumably an NVME_FEAT_* value, confirm
 *   cdw11         32-bit value - presumably copied to command dword 11, confirm
 *   payload       data buffer; payload_size is its size (units assumed bytes)
 * NOTE(review): presumably issues NVME_OPC_GET_FEATURES - confirm.
 */
806 void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
807 uint8_t feature, uint32_t cdw11,
808 void *payload, uint32_t payload_size,
809 nvme_cb_fn_t cb_fn, void *cb_arg);
/*
 * Returns void, so cb_fn and cb_arg are the only visible channel for a
 * result.
 *   log_page      8-bit log page id - presumably an NVME_LOG_* value, confirm
 *   nsid          32-bit namespace id; NVME_GLOBAL_NAMESPACE_TAG is the
 *                 all-ones value defined in this header
 *   payload       data buffer; payload_size is its size (units assumed bytes)
 * NOTE(review): presumably issues NVME_OPC_GET_LOG_PAGE - confirm.
 */
810 void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr,
811 uint8_t log_page, uint32_t nsid,
812 void *payload, uint32_t payload_size,
813 nvme_cb_fn_t cb_fn, void *cb_arg);
815 /* NVM I/O functions */
816 int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload,
817 uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
/*
 * Variant of nvme_ns_cmd_write() that takes a struct bio pointer in place
 * of the payload, lba and lba_count parameters.
 * NOTE(review): the meaning of the int return value is not visible here;
 * presumably 0 on success and an error code otherwise - confirm.
 */
819 int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
820 nvme_cb_fn_t cb_fn, void *cb_arg);
821 int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload,
822 uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
/*
 * Variant of nvme_ns_cmd_read() that takes a struct bio pointer in place
 * of the payload, lba and lba_count parameters.
 * NOTE(review): the meaning of the int return value is not visible here;
 * presumably 0 on success and an error code otherwise - confirm.
 */
824 int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
825 nvme_cb_fn_t cb_fn, void *cb_arg);
826 int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
827 uint8_t num_ranges, nvme_cb_fn_t cb_fn,
829 int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
832 /* Registration functions */
/*
 * Takes one callback of each consumer callback type and returns a pointer
 * to struct nvme_consumer, the same type nvme_unregister_consumer() accepts.
 * NOTE(review): whether any callback may be NULL, and whether the return
 * value can be NULL on failure, is not visible here - confirm.
 */
833 struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
834 nvme_cons_ctrlr_fn_t ctrlr_fn,
835 nvme_cons_async_fn_t async_fn,
836 nvme_cons_fail_fn_t fail_fn);
/*
 * Accepts the pointer type returned by nvme_register_consumer() and returns
 * nothing.
 * NOTE(review): presumably the pointer must not be used afterwards - confirm.
 */
837 void nvme_unregister_consumer(struct nvme_consumer *consumer);
839 /* Controller helper functions */
/* Returns a device_t for the given controller. */
840 device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
/*
 * Returns a pointer to const struct nvme_controller_data, so callers get
 * read-only access.
 * NOTE(review): lifetime of the returned pointer is not visible here.
 */
841 const struct nvme_controller_data *
842 nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
844 /* Namespace helper functions */
/*
 * Accessors that take a namespace pointer and return a value.
 * NOTE(review): units are not stated in this header; sizes are presumably
 * in bytes and nvme_ns_get_num_sectors() presumably counts sectors of
 * nvme_ns_get_sector_size() each - confirm against the implementation.
 */
845 uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
846 uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
847 uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
848 uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
/* NOTE(review): presumably a mask of enum nvme_namespace_flags bits - confirm. */
849 uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
/*
 * Both string accessors return pointers to const char.
 * NOTE(review): whether the strings are NUL-terminated, and how long the
 * pointers stay valid, is not visible here - confirm before printing.
 */
850 const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
851 const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
/* Returns a pointer to const struct nvme_namespace_data (read-only access). */
852 const struct nvme_namespace_data *
853 nvme_ns_get_data(struct nvme_namespace *ns);
855 int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
860 #endif /* __NVME_H__ */