diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d58a6e2a4ab1..dbb661b899ef 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -465,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->device, "starting error recovery\n"); queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } @@ -2155,40 +2156,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) nvme_tcp_queue_request(&ctrl->async_req, true, true); } +static void nvme_tcp_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); +} + static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq, bool reserved) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - - dev_warn(ctrl->ctrl.device, + dev_warn(ctrl->device, "queue %d: timeout request %#x type %d\n", nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + if (ctrl->state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); - nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); + nvme_tcp_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); - nvme_tcp_error_recovery(&ctrl->ctrl); - + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); return BLK_EH_RESET_TIMER; }