Skip to content

Commit cbcfe20

Browse files
committed
nvme-pci: use NSSR to avoid CAP.TO and optimize timeout handling
Reduce hung device recovery time using NSSR and optimized timeouts:

- Skip abort commands and go directly to reset (following FreeBSD)
- Trigger NSSR on hotplug-capable slots for fast device removal
- Skip explicit queue deletion during reset (saves admin_timeout)
- Detect hung devices after the first CAP.TO by checking controller state
- Skip re-enable in reset_work for hung devices (saves a second CAP.TO)
- Remove namespaces immediately for hung devices after DEAD state

NSSR path (NSSR supported + hotplug-capable): After io_timeout (30s), trigger NSSR, which causes a PCIe link down. pciehp detects the link state change and calls nvme_remove() for immediate device removal. Total recovery time: ~30s.

Non-NSSR path (NSSR not supported or non-hotplug slot): After io_timeout (30s), proceed with controller reset. Hung device detection after the first CAP.TO prevents a second CAP.TO wait in reset_work. With the default CAP.TO (~45s), total recovery time: ~75s.

Previously, hung device recovery took io_timeout×2 + admin_timeout + CAP.TO×2. With defaults (io_timeout=30s, admin_timeout=60s, CAP.TO=45s), this was ~210s.

Namespace removal for hung devices cleans up stale block devices that previously remained visible despite a dead controller, enabling proper drive replacement workflows.

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
1 parent 700012f commit cbcfe20

File tree

1 file changed

+122
-1
lines changed

1 file changed

+122
-1
lines changed

drivers/nvme/host/pci.c

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ struct nvme_queue;
138138
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
139139
static void nvme_delete_io_queues(struct nvme_dev *dev);
140140
static void nvme_update_attrs(struct nvme_dev *dev);
141+
static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev);
141142

142143
struct nvme_descriptor_pools {
143144
struct dma_pool *large;
@@ -171,6 +172,13 @@ struct nvme_dev {
171172
struct nvme_ctrl ctrl;
172173
u32 last_ps;
173174
bool hmb;
175+
#ifdef CONFIG_TRUENAS
176+
bool hung_device;
177+
178+
/* NSSR tracking */
179+
bool nssr_pending;
180+
struct completion nssr_done;
181+
#endif
174182
struct sg_table *hmb_sgt;
175183
mempool_t *dmavec_mempool;
176184

@@ -1617,6 +1625,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
16171625
struct pci_dev *pdev = to_pci_dev(dev->dev);
16181626
u32 csts = readl(dev->bar + NVME_REG_CSTS);
16191627
u8 opcode;
1628+
#ifdef CONFIG_TRUENAS
1629+
bool try_nssr = false;
1630+
#endif
16201631

16211632
/*
16221633
* Shutdown the device immediately if we see it is disconnected. This
@@ -1690,6 +1701,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
16901701
* returned to the driver, or if this is the admin queue.
16911702
*/
16921703
opcode = nvme_req(req)->cmd->common.opcode;
1704+
#ifdef CONFIG_TRUENAS
1705+
/*
1706+
* Skip abort and go directly to controller reset, following FreeBSD.
1707+
* I/O timeout often indicates controller-level issues where aborts
1708+
* may not work. Reduces recovery time from 60s to 30s for hung devices.
1709+
*/
1710+
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1711+
try_nssr = true;
1712+
goto disable;
1713+
#endif
16931714
if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
16941715
dev_warn(dev->ctrl.device,
16951716
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
@@ -1741,7 +1762,57 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
17411762
return BLK_EH_DONE;
17421763
}
17431764

1765+
#ifdef CONFIG_TRUENAS
1766+
if (try_nssr && dev->subsystem && pdev->bus->self &&
1767+
pdev->bus->self->is_hotplug_bridge) {
1768+
bool nssr_ok = false;
1769+
1770+
mutex_lock(&dev->shutdown_lock);
1771+
if (!dev->nssr_pending) {
1772+
init_completion(&dev->nssr_done);
1773+
dev->nssr_pending = true;
1774+
writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
1775+
nssr_ok = true;
1776+
}
1777+
mutex_unlock(&dev->shutdown_lock);
1778+
1779+
if (nssr_ok &&
1780+
wait_for_completion_timeout(&dev->nssr_done, 5 * HZ)) {
1781+
dev_warn(dev->ctrl.device,
1782+
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, NSSR\n",
1783+
req->tag, nvme_cid(req), opcode,
1784+
nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
1785+
return BLK_EH_DONE;
1786+
}
1787+
1788+
if (nssr_ok) {
1789+
mutex_lock(&dev->shutdown_lock);
1790+
dev->nssr_pending = false;
1791+
mutex_unlock(&dev->shutdown_lock);
1792+
}
1793+
}
1794+
1795+
if (try_nssr)
1796+
dev_warn(dev->ctrl.device,
1797+
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
1798+
req->tag, nvme_cid(req), opcode,
1799+
nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
1800+
#endif
1801+
17441802
nvme_dev_disable(dev, false);
1803+
1804+
#ifdef CONFIG_TRUENAS
1805+
/*
1806+
* Detect hung device. If device does not respond after
1807+
* io + abort + CAP.TO timeout, no point of trying again
1808+
* in reset work which would cause another CAP.TO wait.
1809+
*/
1810+
mutex_lock(&dev->shutdown_lock);
1811+
if (!nvme_pci_ctrl_is_dead(dev))
1812+
dev->hung_device = true;
1813+
mutex_unlock(&dev->shutdown_lock);
1814+
#endif
1815+
17451816
if (nvme_try_sched_reset(&dev->ctrl))
17461817
nvme_unquiesce_io_queues(&dev->ctrl);
17471818
return BLK_EH_DONE;
@@ -3026,7 +3097,18 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
30263097
nvme_quiesce_io_queues(&dev->ctrl);
30273098

30283099
if (!dead && dev->ctrl.queue_count > 0) {
3100+
#ifdef CONFIG_TRUENAS
3101+
/*
3102+
* Skip queue deletion during reset - controller disable
3103+
* deletes all queues per spec. Only do explicit deletion
3104+
* during clean shutdown. This also avoids admin_timeout
3105+
* wait if delete commands hang on an unresponsive device.
3106+
*/
3107+
if (shutdown)
3108+
nvme_delete_io_queues(dev);
3109+
#else
30293110
nvme_delete_io_queues(dev);
3111+
#endif
30303112
nvme_disable_ctrl(&dev->ctrl, shutdown);
30313113
nvme_poll_irqdisable(&dev->queues[0]);
30323114
}
@@ -3099,13 +3181,29 @@ static void nvme_reset_work(struct work_struct *work)
30993181
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
31003182
int result;
31013183

3184+
#ifdef CONFIG_TRUENAS
3185+
bool was_hung = dev->hung_device;
3186+
dev->hung_device = false;
3187+
#endif
3188+
31023189
if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
31033190
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
31043191
dev->ctrl.state);
31053192
result = -ENODEV;
31063193
goto out;
31073194
}
31083195

3196+
#ifdef CONFIG_TRUENAS
3197+
/*
3198+
* If device was hung (didn't respond after io + abort + CAP.TO),
3199+
* skip re-enable to avoid another CAP.TO wait.
3200+
*/
3201+
if (was_hung) {
3202+
result = -ENODEV;
3203+
goto out;
3204+
}
3205+
#endif
3206+
31093207
/*
31103208
* If we're called to reset a live controller first shut it down before
31113209
* moving on.
@@ -3202,6 +3300,12 @@ static void nvme_reset_work(struct work_struct *work)
32023300
nvme_mark_namespaces_dead(&dev->ctrl);
32033301
nvme_unquiesce_io_queues(&dev->ctrl);
32043302
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
3303+
3304+
#ifdef CONFIG_TRUENAS
3305+
/* Remove namespaces for hung devices after DEAD state */
3306+
if (was_hung)
3307+
nvme_remove_namespaces(&dev->ctrl);
3308+
#endif
32053309
}
32063310

32073311
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@@ -3568,19 +3672,36 @@ static void nvme_shutdown(struct pci_dev *pdev)
35683672
static void nvme_remove(struct pci_dev *pdev)
35693673
{
35703674
struct nvme_dev *dev = pci_get_drvdata(pdev);
3675+
bool dead = false;
3676+
3677+
#ifdef CONFIG_TRUENAS
3678+
mutex_lock(&dev->shutdown_lock);
3679+
dead = dev->nssr_pending;
3680+
if (dead) {
3681+
dev->nssr_pending = false;
3682+
complete(&dev->nssr_done);
3683+
}
3684+
mutex_unlock(&dev->shutdown_lock);
3685+
#endif
35713686

35723687
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
35733688
pci_set_drvdata(pdev, NULL);
35743689

3575-
if (!pci_device_is_present(pdev)) {
3690+
dead |= !pci_device_is_present(pdev);
3691+
if (dead) {
35763692
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
35773693
nvme_dev_disable(dev, true);
35783694
}
35793695

35803696
flush_work(&dev->ctrl.reset_work);
35813697
nvme_stop_ctrl(&dev->ctrl);
35823698
nvme_remove_namespaces(&dev->ctrl);
3699+
#if defined(CONFIG_TRUENAS)
3700+
if (!dead)
3701+
nvme_dev_disable(dev, true);
3702+
#else
35833703
nvme_dev_disable(dev, true);
3704+
#endif
35843705
nvme_free_host_mem(dev);
35853706
nvme_dev_remove_admin(dev);
35863707
nvme_dbbuf_dma_free(dev);

0 commit comments

Comments
 (0)