@@ -138,6 +138,7 @@ struct nvme_queue;
138138static void nvme_dev_disable (struct nvme_dev * dev , bool shutdown );
139139static void nvme_delete_io_queues (struct nvme_dev * dev );
140140static void nvme_update_attrs (struct nvme_dev * dev );
141+ static bool nvme_pci_ctrl_is_dead (struct nvme_dev * dev );
141142
142143struct nvme_descriptor_pools {
143144 struct dma_pool * large ;
@@ -171,6 +172,13 @@ struct nvme_dev {
171172 struct nvme_ctrl ctrl ;
172173 u32 last_ps ;
173174 bool hmb ;
175+ #ifdef CONFIG_TRUENAS
176+ bool hung_device ;
177+
178+ /* NSSR tracking */
179+ bool nssr_pending ;
180+ struct completion nssr_done ;
181+ #endif
174182 struct sg_table * hmb_sgt ;
175183 mempool_t * dmavec_mempool ;
176184
@@ -1617,6 +1625,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
16171625 struct pci_dev * pdev = to_pci_dev (dev -> dev );
16181626 u32 csts = readl (dev -> bar + NVME_REG_CSTS );
16191627 u8 opcode ;
1628+ #ifdef CONFIG_TRUENAS
1629+ bool try_nssr = false;
1630+ #endif
16201631
16211632 /*
16221633 * Shutdown the device immediately if we see it is disconnected. This
@@ -1690,6 +1701,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
16901701 * returned to the driver, or if this is the admin queue.
16911702 */
16921703 opcode = nvme_req (req )-> cmd -> common .opcode ;
1704+ #ifdef CONFIG_TRUENAS
1705+ /*
1706+ * Skip abort and go directly to controller reset, following FreeBSD.
1707+ * I/O timeout often indicates controller-level issues where aborts
1708+ * may not work. Reduces recovery time from 60s to 30s for hung devices.
1709+ */
1710+ nvme_req (req )-> flags |= NVME_REQ_CANCELLED ;
1711+ try_nssr = true;
1712+ goto disable ;
1713+ #endif
16931714 if (!nvmeq -> qid || (iod -> flags & IOD_ABORTED )) {
16941715 dev_warn (dev -> ctrl .device ,
16951716 "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n" ,
@@ -1741,7 +1762,57 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
17411762 return BLK_EH_DONE ;
17421763 }
17431764
1765+ #ifdef CONFIG_TRUENAS
1766+ if (try_nssr && dev -> subsystem && pdev -> bus -> self &&
1767+ pdev -> bus -> self -> is_hotplug_bridge ) {
1768+ bool nssr_ok = false;
1769+
1770+ mutex_lock (& dev -> shutdown_lock );
1771+ if (!dev -> nssr_pending ) {
1772+ init_completion (& dev -> nssr_done );
1773+ dev -> nssr_pending = true;
1774+ writel (NVME_SUBSYS_RESET , dev -> bar + NVME_REG_NSSR );
1775+ nssr_ok = true;
1776+ }
1777+ mutex_unlock (& dev -> shutdown_lock );
1778+
1779+ if (nssr_ok &&
1780+ wait_for_completion_timeout (& dev -> nssr_done , 5 * HZ )) {
1781+ dev_warn (dev -> ctrl .device ,
1782+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, NSSR\n" ,
1783+ req -> tag , nvme_cid (req ), opcode ,
1784+ nvme_opcode_str (nvmeq -> qid , opcode ), nvmeq -> qid );
1785+ return BLK_EH_DONE ;
1786+ }
1787+
1788+ if (nssr_ok ) {
1789+ mutex_lock (& dev -> shutdown_lock );
1790+ dev -> nssr_pending = false;
1791+ mutex_unlock (& dev -> shutdown_lock );
1792+ }
1793+ }
1794+
1795+ if (try_nssr )
1796+ dev_warn (dev -> ctrl .device ,
1797+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n" ,
1798+ req -> tag , nvme_cid (req ), opcode ,
1799+ nvme_opcode_str (nvmeq -> qid , opcode ), nvmeq -> qid );
1800+ #endif
1801+
17441802 nvme_dev_disable (dev , false);
1803+
1804+ #ifdef CONFIG_TRUENAS
1805+ /*
1806+ * Detect hung device. If the device does not respond after the
1807+ * io + abort + CAP.TO timeouts, there is no point in trying again
1808+ * in reset work, which would cause another CAP.TO wait.
1809+ */
1810+ mutex_lock (& dev -> shutdown_lock );
1811+ if (!nvme_pci_ctrl_is_dead (dev ))
1812+ dev -> hung_device = true;
1813+ mutex_unlock (& dev -> shutdown_lock );
1814+ #endif
1815+
17451816 if (nvme_try_sched_reset (& dev -> ctrl ))
17461817 nvme_unquiesce_io_queues (& dev -> ctrl );
17471818 return BLK_EH_DONE ;
@@ -3026,7 +3097,18 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
30263097 nvme_quiesce_io_queues (& dev -> ctrl );
30273098
30283099 if (!dead && dev -> ctrl .queue_count > 0 ) {
3100+ #ifdef CONFIG_TRUENAS
3101+ /*
3102+ * Skip queue deletion during reset - controller disable
3103+ * deletes all queues per spec. Only do explicit deletion
3104+ * during clean shutdown. This also avoids admin_timeout
3105+ * wait if delete commands hang on an unresponsive device.
3106+ */
3107+ if (shutdown )
3108+ nvme_delete_io_queues (dev );
3109+ #else
30293110 nvme_delete_io_queues (dev );
3111+ #endif
30303112 nvme_disable_ctrl (& dev -> ctrl , shutdown );
30313113 nvme_poll_irqdisable (& dev -> queues [0 ]);
30323114 }
@@ -3099,13 +3181,29 @@ static void nvme_reset_work(struct work_struct *work)
30993181 bool was_suspend = !!(dev -> ctrl .ctrl_config & NVME_CC_SHN_NORMAL );
31003182 int result ;
31013183
3184+ #ifdef CONFIG_TRUENAS
3185+ bool was_hung = dev -> hung_device ;
3186+ dev -> hung_device = false;
3187+ #endif
3188+
31023189 if (nvme_ctrl_state (& dev -> ctrl ) != NVME_CTRL_RESETTING ) {
31033190 dev_warn (dev -> ctrl .device , "ctrl state %d is not RESETTING\n" ,
31043191 dev -> ctrl .state );
31053192 result = - ENODEV ;
31063193 goto out ;
31073194 }
31083195
3196+ #ifdef CONFIG_TRUENAS
3197+ /*
3198+ * If device was hung (didn't respond after io + abort + CAP.TO),
3199+ * skip re-enable to avoid another CAP.TO wait.
3200+ */
3201+ if (was_hung ) {
3202+ result = - ENODEV ;
3203+ goto out ;
3204+ }
3205+ #endif
3206+
31093207 /*
31103208 * If we're called to reset a live controller first shut it down before
31113209 * moving on.
@@ -3202,6 +3300,12 @@ static void nvme_reset_work(struct work_struct *work)
32023300 nvme_mark_namespaces_dead (& dev -> ctrl );
32033301 nvme_unquiesce_io_queues (& dev -> ctrl );
32043302 nvme_change_ctrl_state (& dev -> ctrl , NVME_CTRL_DEAD );
3303+
3304+ #ifdef CONFIG_TRUENAS
3305+ /* Remove namespaces for hung devices after DEAD state */
3306+ if (was_hung )
3307+ nvme_remove_namespaces (& dev -> ctrl );
3308+ #endif
32053309}
32063310
32073311static int nvme_pci_reg_read32 (struct nvme_ctrl * ctrl , u32 off , u32 * val )
@@ -3568,19 +3672,36 @@ static void nvme_shutdown(struct pci_dev *pdev)
35683672static void nvme_remove (struct pci_dev * pdev )
35693673{
35703674 struct nvme_dev * dev = pci_get_drvdata (pdev );
3675+ bool dead = false;
3676+
3677+ #ifdef CONFIG_TRUENAS
3678+ mutex_lock (& dev -> shutdown_lock );
3679+ dead = dev -> nssr_pending ;
3680+ if (dead ) {
3681+ dev -> nssr_pending = false;
3682+ complete (& dev -> nssr_done );
3683+ }
3684+ mutex_unlock (& dev -> shutdown_lock );
3685+ #endif
35713686
35723687 nvme_change_ctrl_state (& dev -> ctrl , NVME_CTRL_DELETING );
35733688 pci_set_drvdata (pdev , NULL );
35743689
3575- if (!pci_device_is_present (pdev )) {
3690+ dead |= !pci_device_is_present (pdev );
3691+ if (dead ) {
35763692 nvme_change_ctrl_state (& dev -> ctrl , NVME_CTRL_DEAD );
35773693 nvme_dev_disable (dev , true);
35783694 }
35793695
35803696 flush_work (& dev -> ctrl .reset_work );
35813697 nvme_stop_ctrl (& dev -> ctrl );
35823698 nvme_remove_namespaces (& dev -> ctrl );
3699+ #if defined(CONFIG_TRUENAS )
3700+ if (!dead )
3701+ nvme_dev_disable (dev , true);
3702+ #else
35833703 nvme_dev_disable (dev , true);
3704+ #endif
35843705 nvme_free_host_mem (dev );
35853706 nvme_dev_remove_admin (dev );
35863707 nvme_dbbuf_dma_free (dev );
0 commit comments